<?php
/*
Inspired by an awk BibTeX parser written by Nelson H. F. Beebe over 20 years ago although little of that
remains other than a highly edited braceCount().

Released through http://bibliophile.sourceforge.net under the GPL licence.
Do whatever you like with this -- some credit to the author(s) would be appreciated.

A collection of PHP classes to manipulate bibtex files.

If you make improvements, please consider contacting the administrators at bibliophile.sourceforge.net so that your improvements can be added to the release package.

Mark Grimshaw 2005
http://bibliophile.sourceforge.net

(Amendments to file reading Daniel Pozzi for v1.1)

11/June/2005 - v1.53 Mark Grimshaw:  Stopped expansion of @string when entry is enclosed in {...} or "..."
21/08/2004 v1.4 Guillaume Gardey, Added PHP string parsing and expand macro features.
 Fix bug with comments, strings macro.
    expandMacro = FALSE/TRUE to expand string macros.
    loadStringMacro($bibtex_string) to load a string. (array of lines)
22/08/2004 v1.4 Mark Grimshaw - a few adjustments to Guillaume's code.
28/04/2005 v1.5 Mark Grimshaw - a little debugging for @preamble

02/05/2005 G. Gardey - Add support for @string macro defined by curly brackets:
           @string{M12 = {December}}
                     - Don't expand macro for bibtexCitation and bibtexEntryType
                     - Better support for fields like journal = {Journal of } # JRNL23
03/05/2005 G. Gardey - Fix wrong field value parsing when an entry ends by
                           someField = {value}}

Later maintenance - made the class run on PHP 7/8 (real constructor, no ereg/split, no {}-offsets),
                    fixed the parsing loop hanging on free text between entries, the last line of a
                    string input being dropped, and macro expansion inside delimited values.
*/

// For a quick command-line test (php -f PARSEENTRIES.php) after installation, uncomment these lines:

/*************************
// Parse a file
	$parse = NEW PARSEENTRIES();
	$parse->expandMacro = TRUE;
//	$array = array("RMP" =>"Rev., Mod. Phys.");
//	$parse->loadStringMacro($array);
//	$parse->removeDelimit = FALSE;
//	$parse->fieldExtract = FALSE;
	$parse->openBib("bib.bib");
	$parse->extractEntries();
	$parse->closeBib();
	list($preamble, $strings, $entries) = $parse->returnArrays();
	print_r($preamble);
	print "\n";
	print_r($strings);
	print "\n";
	print_r($entries);
	print "\n\n";
*************************/

/************************
// Parse a bibtex PHP string
	$bibtex_data = <<< END

@STRING{three = "THREE"}
@STRING{two = "TWO"}
@string{JRNL23 = {NatLA 23}}


@article{klitzing.1,
	author = "v. Klitzing and Dorda and Pepper",
	title = "New method for high mark@sirfragalot.com accuracy determination of fine structure constant based on quantized hall resistance",
	volume = "45",
	journal = {{Journal of } # JRNL23 # two},
	pages = "494",
		citeulike-article-id = {12222
	}
	,
	ignoreMe = {blah}, }

@article{klitzing.2,
	author = "Klaus von Klitzing",
	title = "The Quantized Hall Effect",
	volume = "58",
	journal = two,
	pages = "519",
}

END;

	$parse = NEW PARSEENTRIES();
	$parse->expandMacro = TRUE;
//	$parse->removeDelimit = FALSE;
//	$parse->fieldExtract = FALSE;
	$array = array("RMP" =>"Rev., Mod. Phys.");
	$parse->loadStringMacro($array);
	$parse->loadBibtexString($bibtex_data);
	$parse->extractEntries();
	list($preamble, $strings, $entries) = $parse->returnArrays();
	print_r($preamble);
	print "\n";
	print_r($strings);
	print "\n";
	print_r($entries);
	print "\n\n";

**********************/


/**
 * Line-oriented BibTeX parser.
 *
 * Typical use: configure the public flags, feed input with openBib() or
 * loadBibtexString(), call extractEntries(), then collect the result with
 * returnArrays().
 */
class PARSEENTRIES
{
    /** @var array|false Raw @preamble entries while parsing; final form is produced by returnArrays(). */
    public $preamble = array();
    /** @var array|false Raw @string entries while parsing; name => value map after returnArrays(). */
    public $strings = array();
    /** @var array|false Parsed entries (field arrays, or raw strings when fieldExtract is FALSE). */
    public $entries = array();
    /** @var int Index of the entry currently being filled in $entries. */
    public $count = 0;
    /** @var bool Split entries into fields (TRUE) or keep each entry as one raw string (FALSE). */
    public $fieldExtract = true;
    /** @var bool Strip the enclosing {...} or "..." from field values in returnArrays(). */
    public $removeDelimit = true;
    /** @var bool Expand @string and user macros in field values in returnArrays(). */
    public $expandMacro = false;
    /** @var bool TRUE when reading from the file handle, FALSE when reading from $bibtexString. */
    public $parseFile = true;
    /** @var array User supplied macros (name => replacement), see loadStringMacro(). */
    public $userStrings = array();
    /** @var resource|null Handle opened by openBib(). */
    public $fid = null;
    /** @var array Lines to parse in string mode. */
    public $bibtexString = array();
    /** @var int Index of the next line to read in string mode. */
    public $currentLine = 0;
    /** @var bool Set by removeDelimiters(): TRUE when it stripped an enclosing delimiter pair. */
    public $delimitersExist = false;

    /**
     * Reset the parser to its default configuration.
     */
    public function __construct()
    {
        $this->preamble = $this->strings = $this->entries = array();
        $this->count = 0;
        $this->fieldExtract = true;
        $this->removeDelimit = true;
        $this->expandMacro = false;
        $this->parseFile = true;
    }

    /**
     * PHP 4 style constructor name, kept as a plain method so that code calling
     * $obj->PARSEENTRIES() or parent::PARSEENTRIES() keeps working. PHP 8 no longer
     * treats it as a constructor, which is why __construct() exists.
     */
    public function PARSEENTRIES()
    {
        $this->__construct();
    }

    /**
     * Open a bib file for parsing and switch the parser to file mode.
     *
     * @param string $file Path of the BibTeX file.
     * @throws RuntimeException When the path is not a regular file or cannot be opened
     *                          (previously the script was terminated silently with die).
     */
    public function openBib($file)
    {
        if (!is_file($file)) {
            throw new RuntimeException('BibTeX file not found: ' . $file);
        }
        $handle = fopen($file, 'r');
        if ($handle === false) {
            throw new RuntimeException('Unable to open BibTeX file: ' . $file);
        }
        $this->fid = $handle;
        // Needed (not only in the constructor) so file and string parsing can be alternated.
        $this->parseFile = true;
    }

    /**
     * Load BibTeX source held in memory and switch the parser to string mode.
     *
     * @param string|array $bibtex_string Whole source as one string, or an array of lines.
     */
    public function loadBibtexString($bibtex_string)
    {
        if (is_string($bibtex_string)) {
            $lines = explode("\n", $bibtex_string);
        } else {
            // getLine() walks the lines by numeric index, so normalise the keys.
            $lines = array_values((array) $bibtex_string);
        }
        $this->bibtexString = $lines;
        $this->parseFile = false;
        $this->currentLine = 0;
    }

    /**
     * Set user defined string macros.
     *
     * @param array $macro_array Map of macro name => replacement text.
     */
    public function loadStringMacro($macro_array)
    {
        $this->userStrings = is_array($macro_array) ? $macro_array : array();
    }

    /**
     * Close the bib file opened by openBib(). Safe to call when nothing is open.
     */
    public function closeBib()
    {
        if (is_resource($this->fid)) {
            fclose($this->fid);
        }
        $this->fid = null;
    }

    /**
     * Read the next line of input, trimmed, skipping lines whose first character is '%'.
     *
     * @return string|false The trimmed line (possibly empty), or FALSE when the input is exhausted.
     */
    public function getLine()
    {
        if ($this->parseFile) {
            if (!is_resource($this->fid)) {
                return false;
            }
            while (!feof($this->fid)) {
                $raw = fgets($this->fid);
                if ($raw === false) {
                    break;
                }
                $line = trim($raw);
                if ($line === '' || $line[0] != '%') {
                    return $line;
                }
            }
            return false;
        }

        $total = count($this->bibtexString);
        // Test the bound BEFORE reading so the final line is returned rather than dropped.
        while ($this->currentLine < $total) {
            $line = trim((string) $this->bibtexString[$this->currentLine]);
            $this->currentLine++;
            if ($line === '' || $line[0] != '%') {
                return $line;
            }
        }
        return false;
    }

    /**
     * Count entry delimiters on a line.
     *
     * @param string $line         Line to inspect.
     * @param string $delimitStart '{' to count braces; any other value counts parentheses.
     * @return int Number of opening delimiters minus number of closing delimiters.
     */
    public function braceCount($line, $delimitStart)
    {
        if ($delimitStart == '{') {
            $delimitEnd = '}';
        } else {
            $delimitStart = '(';
            $delimitEnd = ')';
        }
        return substr_count($line, $delimitStart) - substr_count($line, $delimitEnd);
    }

    /**
     * Extract the value part of an @string definition.
     *
     * @param string $string Text following the '=' of the definition, still ending with the
     *                       delimiter that closes the @string entry.
     * @return string The value with the closing entry delimiter and its own "..." or {...} removed.
     */
    public function extractStringValue($string)
    {
        $string = (string) $string;
        // Drop the delimiter that closes the @string entry itself.
        $string = trim((string) substr($string, 0, strlen($string) - 1));
        return $this->removeDelimiters($string);
    }

    /**
     * Split off the first field value from a run of "value, key = value, ..." text.
     *
     * @param string $seg Text starting with a field value.
     * @return array array(value, remainder); remainder is FALSE when $seg held the last value.
     */
    public function fieldSplit($seg)
    {
        // The third argument of preg_split() is the LIMIT. The original passed
        // PREG_SPLIT_DELIM_CAPTURE (== 2) there, i.e. "at most two pieces" with no flags;
        // that is exactly what the caller relies on, so it is now spelled out.
        // The /U modifier makes the trailing \s* match nothing, so the remainder keeps the
        // whitespace that follows '='.
        $parts = preg_split("/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U", $seg, 2);
        if (!is_array($parts)) {
            return array($seg, false);
        }
        if (!array_key_exists(1, $parts)) {
            return array($parts[0], false);
        }
        return array($parts[0], $parts[1]);
    }

    /**
     * Extract the "key = value" fields of one entry into $this->entries[$this->count].
     *
     * Values are stored trimmed but still delimited; returnArrays() strips delimiters later
     * because macro expansion needs to know whether a value was delimited.
     *
     * @param string $oldString The field section of an entry, ending with the entry's closing delimiter.
     */
    public function reduceFields($oldString)
    {
        $oldString = (string) $oldString;
        $lg = strlen($oldString);
        if ($lg === 0) {
            return;
        }
        // Remove ONE trailing character only, so an entry ending in "field = {value}}" keeps
        // the brace that belongs to the value.
        $last = $oldString[$lg - 1];
        if ($last == "}" || $last == ")" || $last == ",") {
            $oldString = (string) substr($oldString, 0, $lg - 1);
        }
        $split = preg_split("/=/", $oldString, 2);
        if (!isset($split[1])) {
            return; // no "key = value" pair at all
        }

        // Peel the values off one at a time. Strict tests: a remainder of "0" is a value.
        $values = array();
        $string = $split[1];
        while ($string !== false && $string !== '') {
            list($entry, $string) = $this->fieldSplit($string);
            $values[] = $entry;
        }

        // Cut every value out of the text so only "key =, key =, ..." remains. Each value starts
        // right after its key's '=', so search from there instead of from the beginning; an
        // unanchored search can hit a key that contains the value text (e.g. "a1=1").
        $offset = strlen($split[0]) + 1;
        foreach ($values as $value) {
            $length = strlen($value);
            if ($length > 0) {
                $pos = strpos($oldString, $value, $offset);
                if ($pos !== false) {
                    $oldString = substr_replace($oldString, '', $pos, $length);
                    $offset = $pos;
                }
            }
            $equals = strpos($oldString, '=', $offset);
            if ($equals === false) {
                break;
            }
            $offset = $equals + 1;
        }

        $trimmed = trim($oldString);
        if ($trimmed === '' || substr($trimmed, -1) != ',') {
            $oldString .= ',';
        }
        $keys = preg_split("/=,/", $oldString);
        // The text now ends in "=,", so the split always yields an empty last element.
        array_pop($keys);

        foreach ($keys as $key) {
            $raw = array_shift($values);
            if ($raw === null) {
                break;
            }
            // Remove any dangling ',' left on the final field of the entry.
            $value = trim(rtrim(trim($raw), ","));
            if ($value === '') {
                continue;
            }
            $key = strtolower(trim($key));
            $this->entries[$this->count][$key] = $value;
        }
    }

    /**
     * Start splitting a bibtex entry into component fields: store the entry type and the
     * citation key, then hand the field section to reduceFields().
     *
     * @param string $entry One complete entry on a single line.
     */
    public function fullSplit($entry)
    {
        $matches = preg_split("/@(.*)\s*[{(](.*),/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
        if (!is_array($matches) || count($matches) < 4) {
            // No comma after the opening delimiter, e.g. "@misc{key}" or "@misc{note = {x}}".
            $type = '';
            $body = '';
            if (preg_match('/@([^{(]*)[{(](.*)$/', $entry, $bare)) {
                $type = strtolower(trim($bare[1]));
                $body = trim($bare[2]);
            }
            $this->entries[$this->count]['bibtexEntryType'] = $type;
            if (strpos($body, '=') !== false) {
                $this->entries[$this->count]['bibtexCitation'] = '';
                $this->reduceFields($body);
            } else {
                $this->entries[$this->count]['bibtexCitation'] = trim(rtrim($body, '})'));
            }
            return;
        }
        $this->entries[$this->count]['bibtexEntryType'] = strtolower($matches[1]);
        // Sometimes a bibtex file will have no citation key: what was captured is a field.
        if (preg_match("/=/", $matches[2])) {
            $matches = preg_split("/@(.*)\s*[{(](.*)/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
        }
        $this->entries[$this->count]['bibtexCitation'] = $matches[2];
        $this->reduceFields($matches[3]);
    }

    /**
     * Grab a complete bibtex entry starting at $line and file it under strings, preamble or entries.
     *
     * @param string $line First line of the entry (starts with '@').
     * @return string|false The line read after the entry was complete (it is NOT part of the
     *                      entry and must still be examined by the caller), or FALSE at end of input.
     */
    public function getEntry($line)
    {
        // The type is everything between '@' and the FIRST opening delimiter. (The original
        // matched greedily against a preg_quote()d line, which could select the wrong delimiter
        // and let field text containing "string" pass for an @string entry.)
        if (!preg_match('/@([^{(]*)([{(])/', $line, $matches)) {
            // The opening delimiter is on a following line: join and retry.
            $next = $this->getLine();
            if ($next === false) {
                return false; // truncated input
            }
            return $this->getEntry($line . $next);
        }

        $entry = '';
        $count = 0;
        $lastLine = false;
        do {
            $count += $this->braceCount($line, $matches[2]);
            $entry .= ' ' . $line;
            if (($line = $this->getLine()) === false) {
                $lastLine = false;
                break;
            }
            $lastLine = $line;
        } while ($count > 0); // "> 0": a stray closing delimiter must not swallow the rest of the input

        $type = strtolower(trim($matches[1]));
        if ($type === 'string') {
            $this->strings[] = $entry;
        } elseif ($type === 'preamble') {
            $this->preamble[] = $entry;
        } else {
            if ($this->fieldExtract) {
                $this->fullSplit($entry);
            } else {
                $this->entries[$this->count] = $entry;
            }
            $this->count++;
        }
        return $lastLine;
    }

    /**
     * Remove one enclosing pair of "..." or {...} from a value.
     *
     * Records in $this->delimitersExist whether a pair was removed; removeDelimitersAndExpand()
     * needs that so macros are not expanded inside delimited text.
     *
     * @param string $string Field value.
     * @return string The value without its enclosing delimiters, or unchanged if it has none.
     */
    public function removeDelimiters($string)
    {
        $this->delimitersExist = false;
        if (!is_string($string) || strlen($string) < 2) {
            return $string;
        }
        $first = $string[0];
        $last = $string[strlen($string) - 1];
        // Both ends must match: '"foo" # bar' starts with a quote but is not a quoted value.
        if (($first == "\"" && $last == "\"") || ($first == "{" && $last == "}")) {
            $this->delimitersExist = true;
            return (string) substr($string, 1, -1);
        }
        return $string;
    }

    /**
     * Case-insensitively replace every literal occurrence of a macro name.
     */
    private function replaceMacro($name, $replacement, $subject)
    {
        $name = (string) $name;
        if ($name === '') {
            return $subject; // macro names are occasionally empty; nothing to replace
        }
        return str_ireplace($name, (string) $replacement, $subject);
    }

    /**
     * Remove enclosures around entry field values. Additionally, expand macros if flag set.
     *
     * @param string $string   Field value as stored by reduceFields().
     * @param bool   $preamble TRUE for @preamble text: @string macros are then not applied.
     * @return string The processed value (may carry a trailing space; callers trim it).
     */
    public function removeDelimitersAndExpand($string, $preamble = false)
    {
        $string = $this->removeDelimiters($string);
        $delimitersExist = $this->delimitersExist;
        if ($string && $this->expandMacro) {
            if (!empty($this->strings) && !$preamble) {
                // Macro names are case insensitive.
                foreach ($this->strings as $key => $value) {
                    if (!$delimitersExist) {
                        $string = $this->replaceMacro($key, $value, $string);
                    }
                    // Join the parts of a '#' concatenation with single spaces, stripping the
                    // delimiters of each part, e.g. {Journal of } # JRNL23.
                    $items = explode("#", $string);
                    $string = "";
                    foreach ($items as $val) {
                        $string .= $this->removeDelimiters(trim($val)) . " ";
                    }
                    $string = preg_replace("/\s+/", " ", $string);
                }
            }
            if (!empty($this->userStrings)) {
                // Replace user defined string macros.
                foreach ($this->userStrings as $key => $value) {
                    $string = $this->replaceMacro($key, $value, $string);
                    $string = preg_replace("/\s*#\s*/", " ", $string);
                }
            }
        }
        return $string;
    }

    /**
     * This method starts the whole process: read the input set up by openBib() or
     * loadBibtexString() and collect every entry that starts with '@'.
     */
    public function extractEntries()
    {
        $line = $this->getLine();
        while ($line !== false) {
            if (preg_match("/^@/i", $line)) {
                // getEntry() hands back the first line it read past the entry; examine it next.
                $line = $this->getEntry($line);
            } else {
                // Always move on, otherwise text between entries is re-tested forever.
                $line = $this->getLine();
            }
        }
    }

    /**
     * Return arrays of entries etc. to the calling process.
     *
     * Post-processes what extractEntries() collected and stores the result back on the object.
     *
     * @return array array($preamble, $strings, $entries); each element is FALSE when empty.
     *               $preamble is array('bibtexPreamble' => text) (only the last @preamble is kept),
     *               $strings maps macro name => value when fieldExtract is TRUE.
     */
    public function returnArrays()
    {
        if (is_array($this->preamble)) {
            foreach ($this->preamble as $value) {
                // Lazy prefix: cut at the FIRST opening delimiter so braces inside the
                // preamble text are kept.
                if (!preg_match("/.*?[{(](.*)/", $value, $matches)) {
                    continue;
                }
                $preamble = (string) substr($matches[1], 0, -1);
                $preambles['bibtexPreamble'] = trim($this->removeDelimitersAndExpand(trim($preamble), true));
            }
        }
        if (isset($preambles)) {
            $this->preamble = $preambles;
        }

        if ($this->fieldExtract && is_array($this->strings)) {
            foreach ($this->strings as $value) {
                $value = trim($value);
                $matches = preg_split("/@string\s*([{(])/i", $value, 2, PREG_SPLIT_DELIM_CAPTURE);
                if (!isset($matches[2])) {
                    continue;
                }
                $pair = preg_split("/=/", $matches[2], 2);
                if (!isset($pair[1])) {
                    continue;
                }
                $strings[trim($pair[0])] = trim($this->extractStringValue($pair[1]));
            }
        }
        if (isset($strings)) {
            $this->strings = $strings;
        }

        // removeDelimit and expandMacro have NO effect if !$this->fieldExtract; the
        // parentheses matter because && binds tighter than ||.
        if (($this->removeDelimit || $this->expandMacro) && $this->fieldExtract && is_array($this->entries)) {
            foreach ($this->entries as $i => $fields) {
                if (!is_array($fields)) {
                    continue;
                }
                foreach ($fields as $key => $value) {
                    // Don't expand macros for bibtexCitation and bibtexEntryType.
                    if ((string) $key !== 'bibtexCitation' && (string) $key !== 'bibtexEntryType') {
                        $this->entries[$i][$key] = trim($this->removeDelimitersAndExpand($value));
                    }
                }
            }
        }

        if (empty($this->preamble)) {
            $this->preamble = false;
        }
        if (empty($this->strings)) {
            $this->strings = false;
        }
        if (empty($this->entries)) {
            $this->entries = false;
        }
        return array($this->preamble, $this->strings, $this->entries);
    }
}