2023-01-24 19:00:39 +01:00
< ? php
/*
* To change this license header , choose License Headers in Project Properties .
* To change this template file , choose Tools | Templates
* and open the template in the editor .
*/
/**
 * Parses free-text bibliographic citations into associative arrays.
 *
 * All parsers expect the citation to start with "Authors (Year)." and then
 * apply a type-specific regular expression to the remainder. The patterns are
 * heuristic: they split on dots, commas and colons, so unusual punctuation in
 * a citation can shift text into the wrong field.
 *
 * Return conventions used throughout the class:
 *  - array : the recognised fields,
 *  - -1    : the citation could not be parsed,
 *  - -2    : the publication type has no text parser (parse() only).
 *
 * @author may
 */
class citation {

    /**
     * Entry point: turns a citation string into a record.
     *
     * A DOI found in the citation takes precedence; only when no DOI record is
     * obtained is the text parsed according to $type.
     *
     * The numeric type ids are hard-coded here; per the original author's
     * note, only their friendly names live in the database.
     *
     * @param string $citation Raw citation text.
     * @param int    $type     Publication type id (1 journal article, 2 book,
     *                         3 working paper, 4 conference paper, 5 report,
     *                         6 thesis, 8 chapter).
     * @return array|int Parsed fields (or whatever the DOI lookup returned),
     *                   -1 when parsing failed, -2 for types without a parser.
     */
    public static function parse($citation, $type) {
        $record = self::_checkDOI($citation);
        // Loose comparison kept on purpose: the shape of the DOI lookup result
        // is defined outside this file.
        if ($record != -1) {
            return $record;
        }

        switch ($type) {
            case 1: // Journal article
                return self::_journal2arr($citation);
            case 2: // Book
                return self::_book2arr($citation);
            case 3: // Work paper
                return self::_workpaper2arr($citation);
            case 4: // Conference paper
                return self::_conference2arr($citation);
            case 5: // Report
                return self::_report2arr($citation);
            case 6: // Thesis
                return self::_thesis2arr($citation);
            case 8: // Chapter
                return self::_chapter2arr($citation);
            case 7:  // project
            case 9:  // ESRA paper
            case 10: // newspaper, magazine articles
            case 11: // manuscripts
            default:
                return -2;
        }
    }

    /**
     * Looks for a DOI in the citation and, if one is found, resolves it.
     *
     * Two notations are recognised: "doi:<id>" and "doi.org/<id>". Everything
     * after the marker up to the end of the line is taken as the identifier,
     * so the DOI must be the last thing in the citation. When both notations
     * are present the "doi.org/" form wins because it is checked last.
     *
     * @param string $citation Raw citation text.
     * @return mixed -1 when no usable DOI is present, otherwise the value
     *               returned by doi::fetchData() (a class defined elsewhere).
     */
    private static function _checkDOI($citation) {
        // Whitespace, NUL and a trailing full stop are not part of the DOI.
        $strip = " \n\r\t\v\0.";
        $doi = null;

        if (preg_match('/(.*)doi:(.*)/i', $citation, $hits) === 1) {
            $doi = trim($hits[2], $strip);
        }
        if (preg_match('/(.*)doi\.org\/(.*)/i', $citation, $hits) === 1) {
            $doi = trim($hits[2], $strip);
        }

        // A marker with nothing behind it ("... doi:") is not a DOI; do not
        // trigger a lookup for an empty identifier.
        if ($doi === null || $doi === '') {
            return -1;
        }
        return doi::fetchData($doi);
    }

    /**
     * Parses a book citation of the form
     * "Authors (Year). Title. City: Publisher."
     *
     * A parenthesised suffix in the title, "Title (Series)", is split off into
     * a separate 'series' field.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, city, publisher and
     *                   optionally series; -1 when the pattern does not match.
     */
    private static function _book2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.]+|.*[^?]+)'      // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*?[^\:]+)'            // city
            . '[\:]'
            . '(.*?[^\.]+)'            // publisher
            . '.*'                     // junk
            . '/', $citation, $hits);

        // Full match plus five groups.
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);
        $back = array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'city' => $hits[4],
            'publisher' => $hits[5]);

        if (strpos($hits[3], '(') !== false) {
            $parts = explode('(', $hits[3]);
            $back['title'] = trim($parts[0]);
            $back['series'] = trim($parts[1], " ()\\\n\r\t{},.");
        }
        return $back;
    }

    /**
     * Strips surrounding punctuation and whitespace from every regex hit.
     *
     * The order of the individual trims matters (e.g. a trailing "(" is only
     * removed when it is the very last character after the first pass), so it
     * is kept exactly as a fixed sequence.
     *
     * @param array $arr Strings as produced by preg_match().
     * @return array Same keys, cleaned values.
     */
    private static function _trim($arr) {
        $back = array();
        foreach ($arr as $offset => $line) {
            $line = trim($line, " \n\r\t\\{},.");
            $line = rtrim($line, '(');
            $line = rtrim($line, ' ');
            $line = rtrim($line, ',');
            $line = ltrim($line, ')');
            $line = ltrim($line, ' ');
            $back[$offset] = ltrim($line, ',');
        }
        return $back;
    }

    /**
     * Parses a journal article citation of the form
     * "Authors (Year). Title. Journal, volume(issue), pages."
     *
     * If the full pattern does not match, a reduced pattern ("plan B") is
     * tried that only extracts authors, year, title and journal.
     *
     * Side effect: when neither pattern matches, a diagnostic message and the
     * last match array are written to standard output.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, journal and (full match
     *                   only) issue, pages; -1 when nothing matched.
     */
    private static function _journal2arr($citation) {
        // Both patterns rely on a terminating full stop.
        if (substr($citation, -1) != '.') {
            $citation .= '.';
        }

        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.]+|.*[^?]+)'      // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*?[^0-9]+)'           // journal (may contain commas): up to the first digit
            . '(.*?[^,]+)'             // volume/issue (up to comma)
            . '\,'
            . '(.*?[^\.\,]+)'          // pages (up to , or .)
            . '(\.|,)'
            . '(.*)'                   // rest
            . '/', $citation, $hits);

        // Full match plus eight groups.
        if (count($hits) == 9) {
            $hits = self::_trim($hits);
            return array(
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4],
                'issue' => $hits[5],
                'pages' => $hits[6]
            );
        }

        // Plan B: authors, year, title, journal - ignore the rest.
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*?[^.]+|.*?[^?]+)'    // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*?[^.]+|.*?[^,]+)'    // journal, ending in '.' or ','
            . '[\.,]'
            . '(.*)'                   // rest
            . '/', $citation, $hits);

        // The journal name must contain something besides whitespace.
        if (count($hits) == 6 && trim($hits[4]) !== '') {
            $hits = self::_trim($hits);
            return array(
                'author' => $hits[1],
                'year' => $hits[2],
                'title' => $hits[3],
                'journal' => $hits[4]
            );
        }

        echo "\n***Journal article not recognised***\n";
        print_r($hits);
        return -1;
    }

    /**
     * Parses a working paper citation, assumed to look like
     * "Authors (Year). Title. Series. Publisher".
     *
     * Citation styles vary too much for reliable splitting, so only authors,
     * year and title are returned. The title group is greedy and may swallow
     * the series; a manual fix of that field can be necessary.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title; -1 when the pattern does
     *                   not match.
     */
    private static function _workpaper2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.]+|.*[^?]+)'      // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*?[^\.]+)'            // series
            . '\.'
            . '(.*)'                   // rest
            . '/', $citation, $hits);

        // Full match plus five groups; anything else means "not recognised".
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3]
        );
    }

    /**
     * Parses a conference paper citation of the form
     * "Authors (Year). Title. <anything> for the <Conference>, <City>".
     *
     * The literal words " for the" are required by the pattern. As with
     * working papers the title group is greedy and may need a manual fix.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, conference, city; -1 when
     *                   the pattern does not match.
     */
    private static function _conference2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.]+|.*[^?]+)'      // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*) for the'           // presentation wording (unused)
            . '(.*?[^,]+)'             // conference
            . ','
            . '(.*)'                   // city and rest
            . '/', $citation, $hits);

        // Full match plus six groups; anything else means "not recognised".
        if (count($hits) != 7) {
            return -1;
        }

        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'conference' => $hits[5],
            'city' => $hits[6]
        );
    }

    /**
     * Parses a report citation.
     *
     * Reports usually start with "Authors (Year)." but what follows varies:
     * "title. title. publisher.", "title. series. publisher.",
     * "title. title. series. publisher.", and so on. The pattern takes the
     * authors, the year, a greedy title and the two dot-separated segments
     * that follow it.
     *
     * NOTE(review): the '|' in the pattern sits at the top level of the
     * expression, so the second alternative replaces the whole "authors (year)"
     * prefix instead of only the segment pair. A match through that alternative
     * yields 8 hits and is rejected by the size check below, i.e. only the
     * first alternative ever produces a result. Left untouched because
     * regrouping it would change the hit count.
     *
     * NOTE(review): 'publisher' is taken from group 4, the segment directly
     * after the title; group 5 holds the segment after that. For input shaped
     * "Title. Type. Publisher." group 5 looks like the better source - confirm
     * against real data before changing.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, publisher; -1 when the
     *                   pattern does not match.
     */
    private static function _report2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.])'               // title (greedy)
            . '\.'
            . '(?:(.*?[^\.]+)\.(.*?[^\.]+))|(.*?[^\.]+)' // two trailing segments - see NOTE above
            . '\.'
            . '(.*)'                   // rest
            . '/', $citation, $hits);

        // Full match plus five groups (first alternative only).
        if (count($hits) != 6) {
            return -1;
        }

        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'publisher' => $hits[4]);
    }

    /**
     * Parses a thesis citation of the form
     * "Authors (Year). Title. Type, Publisher." (the separator after the type
     * may be '.' or ',').
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, thesis_type, publisher;
     *                   -1 when the pattern does not match.
     */
    private static function _thesis2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\).'                    // ) followed by any one character
            . '(.*?[^.]+|.*?[^?]+)'    // title, ending in '.' or '?'
            . '[\.?]'
            . '(.*?[^.]+|.*?[^,]+)'    // type
            . '[\.\,]'
            . '(.*?[^.]+)'             // location
            . '\.'
            . '(.*)'                   // rest
            . '/', $citation, $hits);

        // Full match plus six groups.
        if (count($hits) != 7) {
            return -1;
        }

        $hits = self::_trim($hits);
        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            'thesis_type' => $hits[4],
            'publisher' => $hits[5]);
    }

    /**
     * Parses a book chapter citation of the form
     * "Authors (Year). Title. In Editors (Eds.) Book title, pp. x-y. City: Publisher."
     *
     * The pattern is case-insensitive and needs the word "in", a closing
     * parenthesis after the book authors, the marker "pp", and a '.' or ','
     * directly after the page range. Whatever follows the pages is treated as
     * "Location: Publisher" and split at the first colon.
     *
     * Side effect: on a failed match the match array is written to standard
     * output.
     *
     * @param string $citation Raw citation text.
     * @return array|int Keys author, year, title, book_author, booktitle,
     *                   pages, location, publisher (null when the trailing
     *                   part contains no colon); -1 when the pattern does not
     *                   match.
     */
    private static function _chapter2arr($citation) {
        preg_match('/(.*?[^(]+)'       // authors - everything before (
            . '\('
            . '([0-9]?[^)]+)'          // year - up to )
            . '\)\.'
            . '(.*[^.?]+)\bin '        // title, as in 'title. IN' or 'title? IN'
            . '(.*?[^)]+)'             // book authors - up to )
            . '\)'
            . '(.*)\bpp'               // book title
            . '(.*?[^). ]+)'           // pages
            . '\b[.,]'
            . '(.*)'                   // location and publisher
            . '/i', $citation, $hits);

        // Full match plus seven groups.
        if (count($hits) != 8) {
            print_r($hits);
            return -1;
        }

        $hits = self::_trim($hits);

        // The pattern has no separate publisher group: split the tail
        // "City: Publisher" here instead of reading a non-existent hit.
        $location = $hits[7];
        $publisher = null;
        $colon = strpos($hits[7], ':');
        if ($colon !== false) {
            $location = trim(substr($hits[7], 0, $colon));
            $publisher = trim(substr($hits[7], $colon + 1));
        }

        return array(
            'author' => $hits[1],
            'year' => $hits[2],
            'title' => $hits[3],
            // keep only what precedes a role marker such as "(Ed."
            'book_author' => trim(preg_replace('/(.*?[^(]+)(.*)/', '$1', $hits[4])),
            'booktitle' => trim($hits[5], " \n\r\t\v\0("),
            'pages' => $hits[6],
            'location' => $location,
            'publisher' => $publisher);
    }
}