433 lines
14 KiB
PHP
433 lines
14 KiB
PHP
<?php
|
|
|
|
/*
|
|
* To change this license header, choose License Headers in Project Properties.
|
|
* To change this template file, choose Tools | Templates
|
|
* and open the template in the editor.
|
|
*/
|
|
|
|
/**
 * Regex-based parser that turns free-text bibliographic citations into
 * associative arrays.
 *
 * All parsers expect the citation to start with "Authors (YEAR)." and then
 * apply a type-specific pattern to the remainder. A citation that cannot be
 * parsed yields -1; an unsupported citation type yields -2.
 *
 * @author may
 */
class citation {

    /**
     * Entry point: parse a citation of the given type.
     *
     * A DOI lookup is attempted first; only when it yields -1 is the
     * type-specific text parser used.
     *
     * @param string $citation raw citation text
     * @param int    $type     citation type id (1 journal article, 2 book,
     *                         3 working paper, 4 conference paper, 5 report,
     *                         6 thesis, 8 chapter)
     *
     * @return mixed the value returned by doi::fetchData() when a DOI was
     *               found, otherwise the parsed fields as an array, -1 when
     *               the citation could not be parsed, or -2 when the type is
     *               not supported
     */
    public static function parse ($citation, $type) {
        $record = self::_checkDOI($citation);

        // Loose comparison kept on purpose: the return contract of
        // doi::fetchData() is not visible from this file.
        if ($record == -1) {
            // Type ids are hardcoded here; their friendly names live in the DB.
            switch ($type) {
                case 1: // journal article
                    return self::_journal2arr($citation);

                case 2: // book
                    return self::_book2arr($citation);

                case 3: // working paper
                    return self::_workpaper2arr($citation);

                case 4: // conference paper
                    return self::_conference2arr($citation);

                case 5: // report
                    return self::_report2arr($citation);

                case 6: // thesis
                    return self::_thesis2arr($citation);

                case 8: // chapter
                    return self::_chapter2arr($citation);

                case 7:  // project
                case 9:  // ESRA paper
                case 10: // newspaper, magazine articles
                case 11: // manuscripts
                default:
                    return -2;
            }
        }

        return $record;
    }

    /**
     * Look for a DOI in the citation and, if one is present, resolve it.
     *
     * Everything after "doi:" or "doi.org/" up to the end of the line is
     * taken as the DOI, so the DOI must be the last thing in the citation.
     * When both forms occur, the "doi.org/" form wins.
     *
     * @param string $citation raw citation text
     *
     * @return mixed result of doi::fetchData(), or -1 when the citation
     *               carries no (non-empty) DOI
     */
    private static function _checkDOI ($citation) {
        $doi = null;

        if (preg_match('/(.*)doi:(.*)/i', $citation, $hits) === 1) {
            $doi = trim($hits[2], " \n\r\t\v\0.");
        }

        // The dot is escaped so that only a literal "doi.org/" is accepted.
        if (preg_match('/(.*)doi\.org\/(.*)/i', $citation, $hits) === 1) {
            $doi = trim($hits[2], " \n\r\t\v\0.");
        }

        // A bare "doi:" with nothing after it is not a DOI; skip the lookup.
        if ($doi === null || $doi === '') {
            return -1;
        }

        return doi::fetchData($doi);
    }

    /**
     * Parse a book citation: Authors (YEAR). TITLE. City: Publisher.
     *
     * When the title contains "(", the text after the first "(" is split
     * off into a separate 'series' field.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, city, publisher
     *                   (and optionally series), or -1 on failure
     */
    private static function _book2arr ($citation) {
        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.]+|.*[^?]+)'       // title, ended by '.' or '?'; may carry '(series)'
                . '[\.?]'
                . '(.*?[^\:]+)'             // city
                . '[\:]'
                . '(.*?[^\.]+)'             // publisher
                . '.*'                      // junk
                . '/', $citation, $hits);

        // Whole match + 5 groups.
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);

        $back = array(
            'author'    => $hits[1],
            'year'      => $hits[2],
            'title'     => $hits[3],
            'city'      => $hits[4],
            'publisher' => $hits[5]);

        if (strpos($hits[3], '(') !== false) {
            $tmp = explode('(', $hits[3]);

            $back['title']  = trim($tmp[0]);
            $back['series'] = trim($tmp[1], " \(\)\n\r\t\{\}\,\.");
        }

        return $back;
    }

    /**
     * Strip surrounding whitespace, punctuation and stray parentheses from
     * every captured field.
     *
     * The individual trims are applied in a fixed order; keys are preserved.
     *
     * @param array $arr match array as filled in by preg_match()
     *
     * @return array the same keys with cleaned-up values
     */
    private static function _trim ($arr) {
        $back = array();

        foreach ($arr as $offset => $line) {
            $line = trim($line, " \n\r\t\{\}\,\.");
            $line = rtrim($line, "(");
            $line = rtrim($line, " ");
            $line = rtrim($line, ",");
            $line = ltrim($line, ")");
            $line = ltrim($line, " ");

            $back[$offset] = ltrim($line, ",");
        }

        return $back;
    }

    /**
     * Parse a journal article: Authors (year). title. journal, i(v), pages.
     *
     * If the full pattern does not match, a reduced pattern ("plan B") is
     * tried that only extracts authors, year, title and journal.
     *
     * NOTE(review): on failure this method writes a diagnostic message and
     * the raw match array to standard output; kept as in the original.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, journal (plus issue
     *                   and pages when the full pattern matched), or -1
     */
    private static function _journal2arr ($citation) {
        // Both patterns rely on a terminating dot.
        if (substr($citation, -1) != '.') {
            $citation .= '.';
        }

        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.]+|.*[^?]+)'       // title in form 'title.' or 'title?'
                . '[\.?]'
                . '(.*?[^0-9]+)'            // journal (may contain commas), up to the first digit
                . '(.*?[^,]+)'              // vol/issue (match until comma)
                . '\,'
                . '(.*?[^\.\,]+)'           // pages (match until , or .)
                . '(\.|,)'                  // terminator after pages
                . '(.*)'                    // rest
                . '/', $citation, $hits);

        // Whole match + 8 groups.
        if (count($hits) == 9) {
            $hits = self::_trim($hits);

            return array(
                'author'  => $hits[1],
                'year'    => $hits[2],
                'title'   => $hits[3],
                'journal' => $hits[4],
                'issue'   => $hits[5],
                'pages'   => $hits[6]
            );
        }

        // Plan B: retrieve authors, year, title, journal and leave the rest.
        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*?[^.]+|.*?[^?]+)'     // title in form 'title.' or 'title?'
                . '[\.?]'
                . '(.*?[^.]+|.*?[^,]+)'     // journal, ended by '.' or ','
                . '[\.,]'
                . '(.*)'                    // rest
                . '/', $citation, $hits);

        // The journal capture must contain more than whitespace.
        if (count($hits) == 6 && trim($hits[4]) !== '') {
            $hits = self::_trim($hits);

            return array(
                'author'  => $hits[1],
                'year'    => $hits[2],
                'title'   => $hits[3],
                'journal' => $hits[4]
            );
        }

        echo "\n***Journal article not recognised***\n";
        print_r($hits);

        return -1;
    }

    /**
     * Parse a working paper.
     *
     * Many citation styles exist for working papers, so only the first
     * three fields of "Authors (Year). Title. Series. Publisher" are
     * returned. The title may come out broken and need a manual fix.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, or -1 when the
     *                   citation does not match the pattern
     */
    private static function _workpaper2arr ($citation) {
        $matched = preg_match('/(.*?[^(]+)' // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.]+|.*[^?]+)'       // title in form 'title.' or 'title?'
                . '[\.?]'
                . '(.*?[^\.]+)'             // series
                . '\.'
                . '(.*)'                    // rest
                . '/', $citation, $hits);

        // Without a match there are no groups to read.
        if ($matched !== 1) {
            return -1;
        }

        $hits = self::_trim($hits);

        return array(
            'author' => $hits[1],
            'year'   => $hits[2],
            'title'  => $hits[3]
        );
    }

    /**
     * Parse a conference paper.
     *
     * Expects "Authors (Year). Title. <text> for the <conference>, <rest>".
     * The text before " for the" is captured but not returned; whatever
     * follows the first comma after the conference name is returned as
     * 'city'. The title may come out broken and need a manual fix.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, conference, city,
     *                   or -1 when the citation does not match the pattern
     */
    private static function _conference2arr ($citation) {
        $matched = preg_match('/(.*?[^(]+)' // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.]+|.*[^?]+)'       // title in form 'title.' or 'title?'
                . '[\.?]'
                . '(.*) for the'            // presentation type (not returned)
                . '(.*?[^,]+)'              // conference
                . ','
                . '(.*)'                    // rest, returned as city
                . '/', $citation, $hits);

        // Without a match there are no groups to read.
        if ($matched !== 1) {
            return -1;
        }

        $hits = self::_trim($hits);

        return array(
            'author'     => $hits[1],
            'year'       => $hits[2],
            'title'      => $hits[3],
            'conference' => $hits[5],
            'city'       => $hits[6]
        );
    }

    /**
     * Parse a report: Authors (YEAR). TITLE. REPORT_TYPE. Publisher.
     *
     * Reports are cited very inconsistently (title. title. publisher /
     * title. series. publisher / title. title. series. publisher / ...), so
     * only authors, year, a title and one trailing field are extracted.
     *
     * NOTE(review): the "|" in the pattern is not enclosed in a group, so it
     * splits the WHOLE pattern into two alternatives. Only the first
     * alternative (which ends after group 5) produces the 6 entries that
     * are accepted below. 'publisher' is read from group 4, the segment
     * before the last one - confirm whether group 5 was intended. Behaviour
     * is kept as is.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, publisher, or -1
     */
    private static function _report2arr ($citation) {
        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.])'                // title (greedy)
                . '\.'
                . '(?:(.*?[^\.]+)\.(.*?[^\.]+))|(.*?[^\.]+)' // two trailing segments | see NOTE(review)
                . '\.'
                . '(.*)'                    // rest
                . '/', $citation, $hits);

        // Whole match + groups 1-5 (trailing unmatched groups are dropped).
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);

        return array(
            'author'    => $hits[1],
            'year'      => $hits[2],
            'title'     => $hits[3],
            'publisher' => $hits[4]);
    }

    /**
     * Parse a thesis: Authors (YEAR). TITLE. TYPE. PUBLISHER.
     *
     * NOTE(review): unlike the other parsers, the character after ")" is an
     * unescaped "." here, so any character is accepted after the year.
     * Possibly a typo for "\)\." - left unchanged because tightening it
     * would reject citations that currently parse.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, thesis_type,
     *                   publisher, or -1 on failure
     */
    private static function _thesis2arr ($citation) {
        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\).'                     // ) followed by any one character
                . '(.*?[^.]+|.*?[^?]+)'     // title in form 'title.' or 'title?'
                . '[\.?]'
                . '(.*?[^.]+|.*?[^,]+)'     // type
                . '[\.\,]'
                . '(.*?[^.]+)'              // location
                . '\.'
                . '(.*)'                    // rest
                . '/', $citation, $hits);

        // Whole match + 6 groups.
        if (count($hits) != 7) {
            return -1;
        }

        $hits = self::_trim($hits);

        return array(
            'author'      => $hits[1],
            'year'        => $hits[2],
            'title'       => $hits[3],
            'thesis_type' => $hits[4],
            'publisher'   => $hits[5]);
    }

    /**
     * Parse a book chapter: Authors (YEAR). TITLE. In WHERE (pp) publisher.
     *
     * Matching is case-insensitive.
     *
     * NOTE(review): on failure the raw match array is written to standard
     * output; kept as in the original.
     *
     * @param string $citation raw citation text
     *
     * @return array|int array with author, year, title, book_author,
     *                   booktitle, pages, location, publisher, or -1
     */
    private static function _chapter2arr ($citation) {
        preg_match('/(.*?[^(]+)'            // authors - before (
                . '\('                      // (
                . '([0-9]?[^)]+)'           // year - everything before )
                . '\)\.'                    // ).
                . '(.*[^.?]+)\bin '         // title in form 'title. IN' or 'title? IN'
                . '(.*?[^)]+)'              // book author, before )
                . '\)'                      // )
                . '(.*)\bpp'                // book title, up to 'pp'
                . '(.*?[^). ]+)'            // pages
                . '\b[.,]'
                . '(.*)'                    // rest
                . '/i', $citation, $hits);

        // Whole match + 7 groups.
        if (count($hits) != 8) {
            print_r($hits);

            return -1;
        }

        $hits = self::_trim($hits);

        return array(
            'author'      => $hits[1],
            'year'        => $hits[2],
            'title'       => $hits[3],
            // keep only the part of the book-author capture before its first "("
            'book_author' => trim(preg_replace('/(.*?[^(]+)(.*)/', '$1', $hits[4])),
            'booktitle'   => trim($hits[5], " \n\r\t\v\0("),
            'pages'       => $hits[6],
            'location'    => $hits[7],
            // The pattern has only 7 groups, so there has never been an 8th
            // capture to read; the key is kept (as null) for callers.
            // NOTE(review): group 7 may hold "City: Publisher" - confirm.
            'publisher'   => null);
    }
}
|
|
|