<?php

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

/**
 * Heuristic, regex-based parser that splits a free-text bibliographic
 * citation of the general form "Authors (Year). Title. ..." into named fields.
 *
 * Every parser returns an associative array on success and the integer -1
 * when the citation could not be recognised.
 *
 * @author may
 */
class citation {

    /**
     * Entry point: parse a citation string of a known publication type.
     *
     * A DOI found in the citation takes precedence: the result of
     * doi::fetchData() is returned and no regex parsing is attempted.
     *
     * Supported type codes (compared loosely, so numeric strings work too):
     * 1 journal article, 2 book, 3 work paper, 4 conference paper, 5 report,
     * 6 thesis, 8 chapter. Codes 7 (project), 9 (ESRA paper), 10 (newspaper,
     * magazine articles), 11 (manuscripts) and anything else are unsupported.
     *
     * @param string $citation raw citation text
     * @param int    $type     publication type code (see above)
     *
     * @return mixed associative array of fields (or whatever doi::fetchData()
     *               returned), -1 if the citation was not recognised,
     *               -2 if the type is not supported
     */
    public static function parse ($citation, $type) {
        // check for DOI first
        $record = self::_checkDOI($citation);

        // Loose comparison kept on purpose: doi::fetchData() is defined
        // outside this file and its exact failure value is not visible here.
        if ($record != -1) {
            return $record;
        }

        // we don't have a DOI.
        // while type is hardcoded here due to obvious reasons,
        // friendly names can indeed be changed in the DB
        switch ($type) {
            case 1: // Journal article
                return self::_journal2arr($citation);

            case 2: // Book
                return self::_book2arr($citation);

            case 3: // Work paper
                return self::_workpaper2arr($citation);

            case 4: // Conference paper
                return self::_conference2arr($citation);

            case 5: // Report
                return self::_report2arr($citation);

            case 6: // Thesis
                return self::_thesis2arr($citation);

            case 8: // Chapter
                return self::_chapter2arr($citation);

            default:
                // 7 project, 9 ESRA paper, 10 newspaper/magazine articles,
                // 11 manuscripts and every unknown code: not parsed.
                return -2;
        }
    }

    /**
     * Look for a DOI in the citation and, if one is found, resolve it.
     *
     * Two notations are recognised, both case-insensitively: "doi:<id>" and
     * "doi.org/<id>". Everything after the marker up to the end of the line is
     * taken as the identifier, so the DOI must be at the end of the citation.
     * When both notations are present the "doi.org/" one wins.
     *
     * @param string $citation raw citation text
     *
     * @return mixed result of doi::fetchData() or -1 when no usable DOI exists
     */
    private static function _checkDOI ($citation) {
        $doi = -1;

        $patterns = array(
            '/(.*)doi:(.*)/i',
            // the dot is escaped so only a literal "doi.org/" matches
            '/(.*)doi\.org\/(.*)/i',
        );

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $citation, $hits) !== 1) {
                continue;
            }

            $candidate = trim($hits[2], " \n\r\t\v\0.");

            // a bare "doi:" marker without an identifier is not a DOI
            if ($candidate !== '') {
                $doi = $candidate;
            }
        }

        if ($doi === -1) {
            return -1;
        }

        return doi::fetchData($doi);
    }

    /**
     * Parse a book citation: Authors (YEAR). TITLE. City: Publisher.
     *
     * A parenthesised part inside the title is split off as 'series'.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, city, publisher and
     *                   optionally series; -1 if not recognised
     */
    private static function _book2arr ($citation) {
        preg_match ('/(.*?[^(]+)'       // authors - before (
                . '\('                  // (
                .'([0-9]?[^)]+)'        // year - digits before )
                . '\)\.'                // ).
                .'(.*[^.]+|.*[^?]+)'    // title in form 'title.' or 'title?' OR 'title (series)'. Two dots are a problem, suggest changing it to : .
                .'[\.?]'                //
                .'(.*?[^\:]+)'          // City
                .'[\:]'                 // colon between city and publisher
                .'(.*?[^\.]+)'          // publisher
                .'.*'                   // junk
                .'/', $citation, $hits);

        // full match + 5 capture groups
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);

        $back = array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'city' => $hits[4],
            'publisher' => $hits[5]);

        // 'title (series)' -> separate title and series
        if (strpos ($hits[3], '(') !== false) {
            $tmp = explode ('(', $hits[3]);

            $back['title'] = trim($tmp[0]);
            $back['series'] = trim($tmp[1], " \(\)\n\r\t\{\}\,\.");
        }

        return $back;
    }

    /**
     * Strip surrounding whitespace and punctuation from every captured field.
     *
     * Removes spaces, newlines, tabs, backslashes, braces, commas and dots from
     * both ends, then leftover "(", " ", "," on the right and ")", " ", "," on
     * the left. Keys are preserved.
     *
     * @param array $arr match array as filled in by preg_match()
     *
     * @return array same keys, cleaned values
     */
    private static function _trim ($arr) {
        $back = array();

        foreach ($arr as $offset => $line) {
            $line = trim($line, " \n\r\t\{\}\,\.");
            $line = rtrim($line, "(");
            $line = rtrim($line, " ");
            $line = rtrim($line, ",");
            $line = ltrim($line, ")");
            $line = ltrim($line, " ");
            $back[$offset] = ltrim($line, ",");
        }

        return $back;
    }

    /**
     * Parse a journal article: Authors (year). title. journal, i(v), pages.
     *
     * If the full form does not match, a reduced "plan B" pattern retrieves
     * authors, year, title and journal only. When neither matches, a
     * diagnostic message and the match array are printed to the output.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, journal and (full form
     *                   only) issue, pages; -1 if not recognised
     */
    private static function _journal2arr ($citation) {
        // the patterns rely on a terminating dot
        if (substr ($citation, -1) != '.') {
            $citation .= '.';
        }

        preg_match ('/(.*?[^(]+)'       // authors - before (
                . '\('                  // (
                .'([0-9]?[^)]+)'        // year - digits before )
                . '\)\.'                // ).
                .'(.*[^.]+|.*[^?]+)'    // title in form 'title.' or 'title?'
                .'[\.?]'
                .'(.*?[^0-9]+)'         // journal (some have commas, so until ( or digit (pages, volume,...)
                .'(.*?[^,]+)'           // vol/issue (match until comma)
                .'\,'
                .'(.*?[^\.\,]+)'        // pages (match against , or .)
                .'(\.|,)'               // check for size
                .'(.*)'                 // check for size
                .'/', $citation, $hits);

        // full match + 8 capture groups
        if (count($hits) == 9) {
            $hits = self::_trim($hits);

            return array (
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4],
                'issue' => $hits[5],
                'pages' => $hits[6]
            );
        }

        // plan B:
        // retrieve authors, year, title, journal and leave the rest.
        preg_match ('/(.*?[^(]+)'           // authors - before (
                . '\('                      // (
                .'([0-9]?[^)]+)'            // year - digits before )
                . '\)\.'                    // ).
                .'(.*?[^.]+|.*?[^?]+)'      // title in form 'title.' or 'title?'
                .'[\.?]'
                .'(.*?[^.]+|.*?[^,]+)'      // journal, terminated by '.' or ','
                .'[\.,]'
                .'(.*)'                     // check for size
                .'/', $citation, $hits);

        // full match + 5 capture groups, and a journal that is more than
        // whitespace
        if (count($hits) == 6 && trim($hits[4]) !== '') {
            $hits = self::_trim($hits);

            return array (
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4]
            );
        }

        // diagnostic output for manual inspection of unrecognised citations
        echo "\n***Journal article not recognised***\n";
        print_r ($hits);

        return -1;
    }

    /**
     * Parse a work paper: Authors (Year). Title. Series. publisher
     *
     * This is problematic, as there are many different forms (citation
     * styles), so only fields 1-3 (authors, year, title - whatever it is) are
     * returned. The title field can be broken and may need a manual fix.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title; -1 if not recognised
     */
    private static function _workpaper2arr ($citation) {
        $matched = preg_match ('/(.*?[^(]+)'    // authors - before (
                . '\('                          // (
                .'([0-9]?[^)]+)'                // year - digits before )
                . '\)\.'                        // ).
                .'(.*[^.]+|.*[^?]+)'            // title in form 'title.' or 'title?'
                .'[\.?]'
                .'(.*?[^\.]+)'                  // series
                .'\.'
                .'(.*)'                         // check for size
                .'/', $citation, $hits);

        // without a match the groups do not exist; report failure the same
        // way the other parsers do
        if ($matched !== 1) {
            return -1;
        }

        $hits = self::_trim($hits);

        // field 3 can be broken. Manual fix needed.
        return array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3]
        );
    }

    /**
     * Parse a conference paper:
     * Authors (Year). Title. <text> for the <conference>, <city>
     *
     * The pattern requires the literal text " for the" and a comma after the
     * conference name. The title field can be broken and may need a manual fix.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, conference, city;
     *                   -1 if not recognised
     */
    private static function _conference2arr ($citation) {
        $matched = preg_match ('/(.*?[^(]+)'    // authors - before (
                . '\('                          // (
                .'([0-9]?[^)]+)'                // year - digits before )
                . '\)\.'                        // ).
                .'(.*[^.]+|.*[^?]+)'            // title in form 'title.' or 'title?'
                .'[\.?]'
                .'(.*) for the'                 // text in front of the conference name (unused)
                .'(.*?[^,]+)'                   // conference
                .','
                .'(.*)'                         // city
                .'/', $citation, $hits);

        // without a match the groups do not exist; report failure the same
        // way the other parsers do
        if ($matched !== 1) {
            return -1;
        }

        $hits = self::_trim($hits);

        // field 3 can be broken. Manual fix needed.
        return array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'conference' => $hits[5],
            'city' => $hits[6]
        );
    }

    /**
     * Parse a report: Authors (YEAR). TITLE. REPORT_TYPE. Publisher.
     *
     * This is a mess. It is most often AUTHORS (YEAR)., but what follows is
     * random: "title. title. publisher.", "title. series. publisher.",
     * "title. title. series. publisher." or "title in series ...".
     * So we just take authors, year, title and publisher.
     *
     * The unparenthesised "|" splits the pattern into two top-level
     * alternatives. Only the first one (authors, year, title and two further
     * dot-separated segments) yields exactly 6 entries, because trailing
     * groups that did not take part in the match are dropped; the second
     * alternative yields 8 entries and is treated as "not recognised".
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, publisher; -1 if not
     *                   recognised
     */
    private static function _report2arr ($citation) {
        preg_match ('/(.*?[^(]+)'       // authors - before (
                . '\('                  // (
                .'([0-9]?[^)]+)'        // year - digits before )
                . '\)\.'                // ).
                .'(.*[^.])'             // title (greedy: everything up to the last two segments)
                .'\.'
                .'(?:(.*?[^\.]+)\.(.*?[^\.]+))|(.*?[^\.]+)' // report type / series, then publisher. Two dots are a problem, suggest changing it to : .
                .'\.'
                .'(.*)'                 // rest
                .'/', $citation, $hits);

        // If everything's OK, we get size of 6 (full match + groups 1-5)
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim ($hits);

        return array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            // group 4 is the report type / series; the publisher is the
            // segment after it
            'publisher' => $hits[5]);
    }

    /**
     * Parse a thesis: Authors (YEAR). TITLE. TYPE. PUBLISHER.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, thesis_type, publisher;
     *                   -1 if not recognised
     */
    private static function _thesis2arr ($citation) {
        preg_match ('/(.*?[^(]+)'       // authors - before (
                . '\('                  // (
                .'([0-9]?[^)]+)'        // year - digits before )
                . '\)\.'                // ).
                .'(.*[^.]+|.*[^?]+)'    // title in form 'title.' or 'title?'
                .'[\.?]'
                .'(.*[^.]+|.*[^,]+)'    // type
                .'[\.\,]'
                .'(.*[^.]+)'            // location
                .'\.'
                .'(.*)'                 // rest
                .'/', $citation, $hits);

        // If everything's OK, we get size of 7 (full match + 6 groups)
        if (count($hits) != 7) {
            return -1;
        }

        $hits = self::_trim ($hits);

        return array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'thesis_type' => $hits[4],
            'publisher' => $hits[5]);
    }

    /**
     * Parse a book chapter:
     * Authors (YEAR). TITLE. In BOOK_AUTHORS (Eds.) BOOK (pp. x-y). Location: Publisher
     *
     * Matching is case-insensitive. When the citation is not recognised the
     * match array is printed to the output for manual inspection.
     *
     * @param string $citation raw citation text
     *
     * @return array|int keys author, year, title, book_author, booktitle,
     *                   pages, location, publisher; -1 if not recognised
     */
    private static function _chapter2arr ($citation) {
        preg_match ('/(.*?[^(]+)'           // authors - before (
                . '\('                      // (
                .'([0-9]?[^)]+)'            // year - digits before )
                . '\)\.'                    // ).
                .'(.*[^.]+|.*[^?]+) in '    // title in form 'title. IN' or 'title? IN'
                .'(.*?[^)]+)'               // book author before )
                . '\)'                      // )
                .'(.*)\bpp'                 // book
                .'(.*?[^)]+)'               // pps
                . '\)[\.\,]'                // ). or ),
                .'(.*[^\:]+)'               // location
                .'\:'
                .'(.*)'                     // publisher
                .'/i', $citation, $hits);

        // If everything's OK, we get size of 9 (full match + 8 groups)
        if (count($hits) != 9) {
            // diagnostic output for manual inspection
            print_r ($hits);

            return -1;
        }

        $hits = self::_trim ($hits);

        return array (
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            // keep only the part in front of the first "(" (drops e.g. "(Eds.")
            'book_author' => trim(preg_replace ('/(.*?[^(]+)(.*)/', '$1', $hits[4])),
            'booktitle' => trim ($hits[5], " \n\r\t\v\0("),
            'pages' => $hits[6],
            'location' => $hits[7],
            'publisher' => $hits[8]);
    }
}
|
||
|
|