2023-01-24 19:00:39 +01:00

434 lines
14 KiB
PHP

<?php
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
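/**
* Parses free-text bibliographic citations into associative arrays of fields
* (author, year, title, ...), choosing the layout by publication type.
*
* @author may
*/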
class citation {
    /**
     * Entry point to parse citations. Type is needed.
     *
     * A DOI lookup is attempted first; only when it yields -1 is the citation
     * text parsed with the layout that belongs to $type.
     *
     * @param string     $citation Raw citation text.
     * @param int|string $type     Publication type id (compared loosely, so "1" and 1 are equal).
     * @return mixed Whatever doi::fetchData() returned when a DOI was found and resolved,
     *               otherwise an array of parsed fields, -1 when the text was not
     *               recognised, or -2 when the type has no parser.
     */
    public static function parse ($citation, $type) {
        // check for DOI first
        $record = self::_checkDOI($citation);
        // Loose comparison kept on purpose: the failure value of doi::fetchData()
        // is defined outside this class.
        if ($record != -1) {
            return $record;
        }
        // we don't have DOI
        // while type is hardcoded here due to obvious reasons,
        // friendly names can indeed be changed in the DB
        switch ($type) {
            case 1:
                // Journal article
                return self::_journal2arr($citation);
            case 2:
                // Book
                return self::_book2arr($citation);
            case 3:
                // Work paper
                return self::_workpaper2arr($citation);
            case 4:
                // Conference paper
                return self::_conference2arr($citation);
            case 5:
                // Report
                return self::_report2arr($citation);
            case 6:
                // Thesis
                return self::_thesis2arr($citation);
            case 8:
                // chapter
                return self::_chapter2arr($citation);
            default:
                // Also covers the known types without a parser:
                // 7 project, 9 ESRA paper, 10 newspaper/magazine articles, 11 manuscripts.
                return -2;
        }
    }

    /**
     * Looks for a DOI in the citation and, if one is present, resolves it.
     *
     * Two notations are recognised: "doi:<id>" and "doi.org/<id>" (both
     * case-insensitive). Everything after the marker up to the end of the line is
     * taken as the identifier, so the DOI must be at the end of the citation.
     * When both notations occur, the "doi.org/" one wins.
     *
     * @param string $citation Raw citation text.
     * @return mixed Result of doi::fetchData() for the extracted DOI, or -1 when
     *               no non-empty DOI was found.
     */
    private static function _checkDOI ($citation) {
        $doi = null;
        $patterns = array(
            '/(.*)doi:(.*)/i',
            // The dot is escaped so that only a literal "doi.org/" qualifies.
            '/(.*)doi\.org\/(.*)/i',
        );
        foreach ($patterns as $pattern) {
            $hits = array();
            preg_match($pattern, $citation, $hits);
            if (count($hits) === 3) {
                $doi = trim($hits[2], " \n\r\t\v\0.");
            }
        }
        // Nothing found, or only a bare marker with no identifier behind it.
        if ($doi === null || $doi === '') {
            return -1;
        }
        return doi::fetchData($doi);
    }

    /**
     * Runs a citation regex whose beginning is the common "Authors (Year)." part.
     *
     * Capture group 1 is the author text and group 2 the year text; the groups
     * of $tail follow from index 3.
     *
     * @param string $tail      Pattern fragment appended after the common part (no delimiters).
     * @param string $citation  Raw citation text.
     * @param string $modifiers PCRE modifiers appended after the closing delimiter.
     * @return array The preg_match() captures; empty when the pattern did not match.
     */
    private static function _match ($tail, $citation, $modifiers = '') {
        $head = '(.*?[^(]+)'     // authors - before (
              . '\('             // (
              . '([0-9]?[^)]+)'  // year - digits before )
              . '\)\.';          // ).
        $hits = array();
        preg_match('/' . $head . $tail . '/' . $modifiers, $citation, $hits);
        return $hits;
    }

    /**
     * Parses a book citation of the form: Authors (YEAR). TITLE. City: Publisher.
     *
     * A title written as "title (series)" is split into 'title' and 'series'.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, city, publisher (and optionally
     *                   series), or -1 when the citation does not match.
     */
    private static function _book2arr ($citation) {
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+)'   // title in form 'title.' or 'title?' OR 'title (series)'.
                                  // Two dots are a problem, suggest changing it to ':'.
            . '[\.?]'
            . '(.*?[^\:]+)'       // City
            . '[\:]'
            . '(.*?[^\.]+)'       // publisher
            . '.*',               // junk
            $citation
        );
        // If everything's OK, we get size of 6
        if (count($hits) !== 6) {
            return -1;
        }
        $hits = self::_trim($hits);
        $back = array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'city' => $hits[4],
            'publisher' => $hits[5]);
        if (strpos($hits[3], '(') !== false) {
            $tmp = explode('(', $hits[3]);
            $back['title'] = trim($tmp[0]);
            $back['series'] = trim($tmp[1], " \(\)\n\r\t\{\}\,\.");
        }
        return $back;
    }

    /**
     * Strips surrounding punctuation and whitespace from every captured field.
     *
     * The order of the individual trims is significant and is kept as is.
     * Note that "\{", "\}", "\," and "\." are not escape sequences in a
     * double-quoted PHP string, so the first character list also contains a
     * literal backslash.
     *
     * @param array $arr Captures as returned by preg_match().
     * @return array Same keys, cleaned values.
     */
    private static function _trim ($arr) {
        $back = array();
        foreach ($arr as $offset => $line) {
            $line = trim($line, " \n\r\t\{\}\,\.");
            $line = rtrim($line, "(");
            $line = rtrim($line, " ");
            $line = rtrim($line, ",");
            $line = ltrim($line, ")");
            $line = ltrim($line, " ");
            $back[$offset] = ltrim($line, ",");
        }
        return $back;
    }

    /**
     * Parses a journal article: Authors (year). title. journal, i(v), pages.
     *
     * If the full layout does not match, a reduced pattern ("plan B") extracts
     * authors, year, title and journal only. When neither matches, a diagnostic
     * message and the raw captures are printed.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, journal (plus issue and pages
     *                   for a full match), or -1 when nothing matched.
     */
    private static function _journal2arr ($citation) {
        // Both patterns rely on a terminating dot.
        if (substr($citation, -1) !== '.') {
            $citation .= '.';
        }
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+)'   // title in form 'title.' or 'title?'
            . '[\.?]'
            . '(.*?[^0-9]+)'      // journal (some have commas, so until ( or digit (pages, volume,...)
            . '(.*?[^,]+)'        // vol/issue (match until comma)
            . '\,'
            . '(.*?[^\.\,]+)'     // pages (match against , or .)
            . '(\.|,)'            // check for size
            . '(.*)',             // check for size
            $citation
        );
        if (count($hits) === 9) {
            $hits = self::_trim($hits);
            return array(
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4],
                'issue' => $hits[5],
                'pages' => $hits[6]
            );
        }
        // plan B:
        // retrieve authors, year, title, journal and leave the rest.
        $hits = self::_match(
            '(.*?[^.]+|.*?[^?]+)'   // title in form 'title.' or 'title?'
            . '[\.?]'
            . '(.*?[^.]+|.*?[^,]+)' // journal, terminated by '.' or ','
            . '[\.,]'
            . '(.*)',               // check for size
            $citation
        );
        // A journal consisting of whitespace only counts as "not found".
        if (count($hits) === 6 && trim($hits[4]) !== '') {
            $hits = self::_trim($hits);
            return array(
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4]
            );
        }
        echo "\n***Journal article not recognised***\n";
        print_r($hits);
        return -1;
    }

    /**
     * Parses a working paper.
     *
     * This is problematic, as there are many different forms (citation styles).
     * The layout assumed is: Authors (Year). Title. Series. publisher -
     * but only fields 1-3 (authors, year, title) are returned.
     * The title can be broken; a manual fix may be needed.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, or -1 when the citation does not match.
     */
    private static function _workpaper2arr ($citation) {
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+)'   // title in form 'title.' or 'title?'
            . '[\.?]'
            . '(.*?[^\.]+)'       // series
            . '\.'
            . '(.*)',             // check for size
            $citation
        );
        // Without a match there is nothing to read the fields from.
        if (count($hits) !== 6) {
            return -1;
        }
        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3]
        );
    }

    /**
     * Parses a conference paper.
     *
     * This is problematic, as there are many different forms (citation styles).
     * The layout assumed is: Authors (Year). Title. <text> for the <conference>, <city...>
     * The title can be broken; a manual fix may be needed.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, conference, city, or -1 when
     *                   the citation does not match.
     */
    private static function _conference2arr ($citation) {
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+)'   // title in form 'title.' or 'title?'
            . '[\.?]'
            . '(.*) for the'      // text between the title and " for the" (not returned)
            . '(.*?[^,]+)'        // conference
            . ','
            . '(.*)',             // city
            $citation
        );
        // Without a match there is nothing to read the fields from.
        if (count($hits) !== 7) {
            return -1;
        }
        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'conference' => $hits[5],
            'city' => $hits[6]
        );
    }

    /**
     * Parses a report citation.
     *
     * This is a mess. It is most often AUTHORS (YEAR), but what's next is random:
     *   title. title. publisher.
     *   title. series. publisher.
     *   title. title. series. publisher.
     *   title in series ....
     * The layout targeted here is: Authors (YEAR). TITLE. REPORT_TYPE. Publisher.
     *
     * NOTE(review): the "|" in the pattern is a top-level alternation, so the
     * second alternative is not anchored to the "Authors (Year)." part, and
     * 'publisher' is taken from capture group 4 although the layout above
     * suggests group 5 - confirm the intent before changing either.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, publisher, or -1 when the
     *                   capture count is not 6.
     */
    private static function _report2arr ($citation) {
        $hits = self::_match(
            '(.*[^.])'            // title
            . '\.'
            . '(?:(.*?[^\.]+)\.(.*?[^\.]+))|(.*?[^\.]+)'
            . '\.'
            . '(.*)',             // rest
            $citation
        );
        // If everything's OK, we get size of 6
        if (count($hits) !== 6) {
            return -1;
        }
        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'publisher' => $hits[4]);
    }

    /**
     * Parses a thesis citation of the form: Authors (YEAR). TITLE. TYPE. PUBLISHER.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, thesis_type, publisher, or -1
     *                   when the citation does not match.
     */
    private static function _thesis2arr ($citation) {
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+)'   // title in form 'title.' or 'title?'
            . '[\.?]'
            . '(.*[^.]+|.*[^,]+)' // type
            . '[\.\,]'
            . '(.*[^.]+)'         // location
            . '\.'
            . '(.*)',             // rest
            $citation
        );
        // If everything's OK, we get size of 7
        if (count($hits) !== 7) {
            return -1;
        }
        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'thesis_type' => $hits[4],
            'publisher' => $hits[5]);
    }

    /**
     * Parses a book chapter: Authors (YEAR). TITLE in EDITORS (...) BOOK pp PAGES). Location: Publisher
     *
     * Matching is case-insensitive. When the citation does not match, the raw
     * captures are printed.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, book_author, booktitle, pages,
     *                   location, publisher, or -1 when the citation does not match.
     */
    private static function _chapter2arr ($citation) {
        $hits = self::_match(
            '(.*[^.]+|.*[^?]+) in ' // title in form 'title. IN' or 'title? IN'
            . '(.*?[^)]+)'          // book author before )
            . '\)'                  // )
            . '(.*)\bpp'            // book
            . '(.*?[^)]+)'          // pps
            . '\)[\.\,]'            // ). or ),
            . '(.*[^\:]+)'          // location
            . '\:'
            . '(.*)',               // publisher
            $citation,
            'i'
        );
        // If everything's OK, we get size of 9
        if (count($hits) !== 9) {
            print_r($hits);
            return -1;
        }
        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            // keep only the part of the book-author field in front of a "("
            'book_author' => trim(preg_replace('/(.*?[^(]+)(.*)/', '$1', $hits[4])),
            'booktitle' => trim($hits[5], " \n\r\t\v\0("),
            'pages' => $hits[6],
            'location' => $hits[7],
            'publisher' => $hits[8]);
    }
}