1<?php
2/*
3Inspired by an awk BibTeX parser written by Nelson H. F. Beebe over 20 years ago although little of that
4remains other than a highly edited braceCount().
5
6Released through http://bibliophile.sourceforge.net under the GPL licence.
7Do whatever you like with this -- some credit to the author(s) would be appreciated.
8
9A collection of PHP classes to manipulate bibtex files.
10
11If you make improvements, please consider contacting the administrators at bibliophile.sourceforge.net so that your improvements can be added to the release package.
12
13Mark Grimshaw 2005
14http://bibliophile.sourceforge.net
15
16(Amendments to file reading Daniel Pozzi for v1.1)
17
1811/June/2005 - v1.53 Mark Grimshaw:  Stopped expansion of @string when entry is enclosed in {...} or "..."
1921/08/2004 v1.4 Guillaume Gardey, Added PHP string parsing and expand macro features.
20 Fix bug with comments, strings macro.
21    expandMacro = FALSE/TRUE to expand string macros.
22    loadStringMacro($bibtex_string) to load a string. (array of lines)
2322/08/2004 v1.4 Mark Grimshaw - a few adjustments to Guillaume's code.
2428/04/2005 v1.5 Mark Grimshaw - a little debugging for @preamble
25
2602/05/2005 G. Gardey - Add support for @string macro defined by curly brackets:
27           @string{M12 = {December}}
28                     - Don't expand macro for bibtexCitation and bibtexEntryType
29                     - Better support for fields like journal = {Journal of } # JRNL23
3003/05/2005 G. Gardey - Fix wrong field value parsing when an entry ends by
31                           someField = {value}}
32
33*/
34
35// For a quick command-line test (php -f PARSEENTRIES.php) after installation, uncomment these lines:
36
37/*************************
38// Parse a file
39	$parse = NEW PARSEENTRIES();
40	$parse->expandMacro = TRUE;
41//	$array = array("RMP" =>"Rev., Mod. Phys.");
42//	$parse->loadStringMacro($array);
43//	$parse->removeDelimit = FALSE;
44//	$parse->fieldExtract = FALSE;
45	$parse->openBib("bib.bib");
46	$parse->extractEntries();
47	$parse->closeBib();
48	list($preamble, $strings, $entries) = $parse->returnArrays();
49	print_r($preamble);
50	print "\n";
51	print_r($strings);
52	print "\n";
53	print_r($entries);
54	print "\n\n";
55*************************/
56
57/************************
58// Parse a bibtex PHP string
59	$bibtex_data = <<< END
60
61@STRING{three = "THREE"}
62@STRING{two = "TWO"}
63@string{JRNL23 = {NatLA 23}}
64
65
66@article{klitzing.1,
67	author = "v. Klitzing and Dorda and Pepper",
68	title = "New method for high mark@sirfragalot.com accuracy determination of fine structure constant based on quantized hall resistance",
69	volume = "45",
70	journal = {{Journal of } # JRNL23 # two},
71	pages = "494",
72               citeulike-article-id = {12222
73    }
74               ,
75               ignoreMe = {blah}, }
76
77@article{klitzing.2,
78	author = "Klaus von Klitzing",
79	title = "The Quantized Hall Effect",
80	volume = "58",
81	journal = two,
82	pages = "519",
83}
84
85END;
86
87	$parse = NEW PARSEENTRIES();
88	$parse->expandMacro = TRUE;
89//	$parse->removeDelimit = FALSE;
90//	$parse->fieldExtract = FALSE;
91	$array = array("RMP" =>"Rev., Mod. Phys.");
92	$parse->loadStringMacro($array);
93	$parse->loadBibtexString($bibtex_data);
94	$parse->extractEntries();
95	list($preamble, $strings, $entries) = $parse->returnArrays();
96	print_r($preamble);
97	print "\n";
98	print_r($strings);
99	print "\n";
100	print_r($entries);
101	print "\n\n";
102
103**********************/
104
105
/**
 * Line-oriented BibTeX parser.
 *
 * Typical use:
 *   $parse = new PARSEENTRIES();
 *   $parse->openBib('file.bib');            // or $parse->loadBibtexString($text);
 *   $parse->extractEntries();
 *   $parse->closeBib();                     // file mode only
 *   list($preamble, $strings, $entries) = $parse->returnArrays();
 *
 * Known limitations (unchanged from earlier versions):
 *   - delimiters are counted per line without regard to escaping or quoting;
 *   - a field value containing text of the form ", name =" is split at that point;
 *   - lines whose first non-blank character is '%' are dropped, even inside an entry;
 *   - '#' concatenation joins its parts with a single space.
 */
class PARSEENTRIES
{
	/** Raw "@preamble" entries while parsing; returnArrays() replaces it with its result (array or FALSE). */
	public $preamble = array();
	/** Raw "@string" entries while parsing; returnArrays() turns it into a name => value map (or FALSE). */
	public $strings = array();
	/** One element per entry: a field map, or the raw entry text when $fieldExtract is FALSE. */
	public $entries = array();
	/** Index at which the next entry is stored in $entries. */
	public $count = 0;
	/** TRUE: split entries into fields. FALSE: keep every entry as one raw string. */
	public $fieldExtract = TRUE;
	/** returnArrays() post-processes field values when this or $expandMacro is TRUE. */
	public $removeDelimit = TRUE;
	/** TRUE: replace bare macro names by their values in returnArrays(). */
	public $expandMacro = FALSE;
	/** TRUE: getLine() reads from $fid. FALSE: getLine() reads from $bibtexString. */
	public $parseFile = TRUE;
	/** File handle set by openBib(). */
	public $fid = NULL;
	/** Lines set by loadBibtexString(). */
	public $bibtexString = array();
	/** Index of the next line of $bibtexString that getLine() will read. */
	public $currentLine = 0;
	/** Caller supplied macros (name => value), see loadStringMacro(). */
	public $userStrings = array();
	/** Set by removeDelimiters(): TRUE when its last call stripped an enclosing pair. */
	public $delimitersExist = FALSE;

	/**
	 * Reset the parser to its defaults.
	 */
	public function __construct()
	{
		$this->preamble = $this->strings = $this->entries = array();
		$this->count = 0;
		$this->fieldExtract = TRUE;
		$this->removeDelimit = TRUE;
		$this->expandMacro = FALSE;
		$this->parseFile = TRUE;
	}

	/**
	 * PHP 4 style constructor name, kept only so that existing code calling it
	 * explicitly keeps working. PHP 8 no longer treats it as a constructor.
	 */
	public function PARSEENTRIES()
	{
		$this->__construct();
	}

	/**
	 * Open a .bib file and switch the parser to file mode.
	 * Terminates the script when the file is missing or cannot be opened.
	 *
	 * @param string $file path of the BibTeX file
	 */
	public function openBib($file)
	{
		if(!is_file($file))
			die('PARSEENTRIES: BibTeX file not found');
		$handle = fopen($file, 'r');
		if($handle === FALSE)
			die('PARSEENTRIES: BibTeX file could not be opened');
		$this->fid = $handle;
		// Set here as well as in the constructor so that one object can
		// alternate between file parsing and string parsing.
		$this->parseFile = TRUE;
	}

	/**
	 * Load BibTeX source from memory and switch the parser to string mode.
	 *
	 * @param string|array $bibtex_string the whole source, or an array of lines
	 */
	public function loadBibtexString($bibtex_string)
	{
		if(is_string($bibtex_string))
			$this->bibtexString = preg_split('/\r\n|\r|\n/', $bibtex_string);
		else
			$this->bibtexString = array_values((array) $bibtex_string);
		$this->parseFile = FALSE;
		$this->currentLine = 0;
	}

	/**
	 * Register caller supplied macros.
	 *
	 * @param array $macro_array macro name => replacement text
	 */
	public function loadStringMacro($macro_array)
	{
		$this->userStrings = $macro_array;
	}

	/**
	 * Close the file opened by openBib(). Safe to call when no file is open.
	 */
	public function closeBib()
	{
		if(is_resource($this->fid))
			fclose($this->fid);
		$this->fid = NULL;
	}

	/**
	 * Read the next line that is not a comment (first non-blank character '%').
	 *
	 * @return string|false the trimmed line (possibly empty), or FALSE when the input is exhausted
	 */
	public function getLine()
	{
		if($this->parseFile)
		{
			if(!is_resource($this->fid))
				return FALSE;
			while(($raw = fgets($this->fid)) !== FALSE)
			{
				$line = trim($raw);
				if($line === '' || $line[0] != '%')
					return $line;
			}
			return FALSE;
		}
		$total = count($this->bibtexString);
		while($this->currentLine < $total)
		{
			$line = trim($this->bibtexString[$this->currentLine]);
			$this->currentLine++;
			if($line === '' || $line[0] != '%')
				return $line;
		}
		return FALSE;
	}

	/**
	 * Net number of entry delimiters opened on a line.
	 *
	 * @param string $line
	 * @param string $delimitStart '{' to count braces; anything else counts parentheses
	 * @return int occurrences of the opening delimiter minus occurrences of the closing one
	 */
	public function braceCount($line, $delimitStart)
	{
		if($delimitStart == '{')
			$delimitEnd = '}';
		else
		{
			$delimitStart = '(';
			$delimitEnd = ')';
		}
		return substr_count($line, $delimitStart) - substr_count($line, $delimitEnd);
	}

	/**
	 * Extract the value of an @string definition.
	 *
	 * @param string $string text following '=' in the definition, still carrying
	 *                       the delimiter that closes the @string entry as its last character
	 * @return string the value without the entry delimiter and without its own {...} or "..."
	 */
	public function extractStringValue($string)
	{
		// Drop the character that closes the @string entry itself.
		$string = trim((string) substr($string, 0, -1));
		return $this->removeDelimiters($string);
	}

	/**
	 * Cut a field list at the first ", name =" boundary.
	 * Kept for callers of earlier versions; reduceFields() no longer needs it.
	 *
	 * @param string $seg field text beginning with a value
	 * @return array (text before the boundary, text after it or FALSE when there is no boundary)
	 */
	public function fieldSplit($seg)
	{
		// The third argument of preg_split() is the limit: exactly two pieces are wanted.
		$array = preg_split("/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U", $seg, 2);
		if(!array_key_exists(1, $array))
			return array($array[0], FALSE);
		return array($array[0], $array[1]);
	}

	/**
	 * Split the field part of an entry into name/value pairs and store them,
	 * values still delimited, in $this->entries[$this->count]. Field names are
	 * lower-cased; fields with an empty value are skipped.
	 *
	 * @param string $oldString everything after the citation key, including the delimiter closing the entry
	 */
	public function reduceFields($oldString)
	{
		$lg = strlen($oldString);
		if($lg == 0)
			return;
		// Remove exactly one trailing character so that an entry ending in
		// "field = {value}}" keeps the brace belonging to the value.
		$last = $oldString[$lg - 1];
		if($last == '}' || $last == ')' || $last == ',')
			$oldString = substr($oldString, 0, $lg - 1);
		$split = preg_split("/=/", $oldString, 2);
		if(count($split) < 2)
			return; // no field at all
		// With the delimiter captured the result is: value, name, value, name, value ...
		$pieces = preg_split("/,\s*([-_.:,a-zA-Z0-9]+)\s*={1}\s*/U", $split[1], -1, PREG_SPLIT_DELIM_CAPTURE);
		array_unshift($pieces, $split[0]);
		$total = count($pieces);
		for($i = 0; $i + 1 < $total; $i += 2)
		{
			$key = strtolower(trim($pieces[$i]));
			// The final field of an entry may carry a dangling ','.
			$value = trim(rtrim(trim($pieces[$i + 1]), ','));
			if($value === '')
				continue;
			// Delimiters stay in place: returnArrays() needs them to tell
			// literal text from bare macro names.
			$this->entries[$this->count][$key] = $value;
		}
	}

	/**
	 * Store entry type and citation key of an entry in $this->entries[$this->count]
	 * (keys 'bibtexEntryType' and 'bibtexCitation'), then store its fields.
	 *
	 * @param string $entry complete text of one entry
	 */
	public function fullSplit($entry)
	{
		$matches = preg_split("/@(.*)\s*[{(](.*),/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
		// Some entries have no citation key (the "key" is really the first field),
		// others contain no ',' at all: split again without requiring the ','.
		if(count($matches) < 4 || preg_match("/=/", $matches[2]))
			$matches = preg_split("/@(.*)\s*[{(](.*)/U", $entry, 2, PREG_SPLIT_DELIM_CAPTURE);
		if(count($matches) < 4)
		{
			$this->entries[$this->count]['bibtexEntryType'] = '';
			$this->entries[$this->count]['bibtexCitation'] = '';
			return;
		}
		$citation = trim($matches[2]);
		$fields = $matches[3];
		if($citation === '' && strpos($fields, '=') === FALSE)
		{
			// An entry such as "@misc{key}": the body is the citation key.
			$citation = trim(rtrim(trim($fields), '})'));
			$fields = '';
		}
		$this->entries[$this->count]['bibtexEntryType'] = strtolower($matches[1]);
		$this->entries[$this->count]['bibtexCitation'] = $citation;
		$this->reduceFields($fields);
	}

	/**
	 * Read one complete entry, starting with $line, and file it under
	 * $this->strings, $this->preamble or $this->entries.
	 *
	 * @param string $line first line of the entry
	 * @return string|false the line read after the entry and not yet processed,
	 *                      or FALSE when the input is exhausted
	 */
	public function getEntry($line)
	{
		// The opening delimiter may be on a later line than the '@'.
		while(!preg_match('/@([^{(]*)([{(])/', $line, $matches))
		{
			$next = $this->getLine();
			if($next === FALSE)
				return FALSE;
			$line .= $next;
		}
		$entry = '';
		$count = 0;
		do
		{
			$count += $this->braceCount($line, $matches[2]);
			$entry .= ' ' . $line;
			$line = $this->getLine();
		}
		while($line !== FALSE && $count > 0);

		$type = strtolower(trim($matches[1]));
		if($type == 'string')
			$this->strings[] = $entry;
		else if($type == 'preamble')
			$this->preamble[] = $entry;
		else
		{
			if($this->fieldExtract)
				$this->fullSplit($entry);
			else
				$this->entries[$this->count] = $entry;
			$this->count++;
		}
		return $line;
	}

	/**
	 * Strip one enclosing {...} or "..." pair, provided the pair encloses the
	 * whole string. Records in $this->delimitersExist whether a pair was removed.
	 *
	 * @param string $string
	 * @return string
	 */
	public function removeDelimiters($string)
	{
		$string = (string) $string;
		$this->delimitersExist = $this->isFullyDelimited($string);
		if($this->delimitersExist)
			return (string) substr($string, 1, -1);
		return $string;
	}

	/**
	 * Strip the delimiters of a field value and, when $this->expandMacro is set,
	 * expand macros.
	 *
	 * A value wholly enclosed in {...} or "..." is literal text and is returned
	 * without its delimiters and otherwise unchanged. Any other value is cut at
	 * '#'; every part is either a delimited literal (delimiters removed) or a
	 * bare token, which is replaced when it equals a macro name, compared case
	 * insensitively. The parts are joined with single spaces.
	 *
	 * @param string $string   raw field value
	 * @param bool   $preamble TRUE to ignore macros defined by @string and use caller supplied macros only
	 * @return string
	 */
	public function removeDelimitersAndExpand($string, $preamble = FALSE)
	{
		$string = $this->removeDelimiters($string);
		if($this->delimitersExist || !$this->expandMacro || $string === '')
			return $string;
		$macros = $this->macroTable($preamble);
		if(empty($macros))
			return $string;
		$expanded = '';
		foreach(explode('#', $string) as $item)
		{
			$item = $this->removeDelimiters(trim($item));
			if(!$this->delimitersExist)
			{
				$name = strtolower($item);
				if(isset($macros[$name]))
					$item = $macros[$name];
			}
			$expanded .= $item . ' ';
		}
		return preg_replace("/\s+/", " ", $expanded);
	}

	/**
	 * Read everything from the current source and collect preambles, macro
	 * definitions and entries. Text outside of entries is ignored.
	 */
	public function extractEntries()
	{
		$line = $this->getLine();
		while($line !== FALSE)
		{
			// getEntry() hands back the line it read past the end of the entry.
			if(preg_match("/^@/", $line))
				$line = $this->getEntry($line);
			else
				$line = $this->getLine();
		}
	}

	/**
	 * Finish processing and return the results. Call once, after extractEntries().
	 *
	 * @return array (preamble, strings, entries); each element is FALSE when empty.
	 *               preamble is array('bibtexPreamble' => text) (of several @preamble entries the last one wins),
	 *               strings maps macro names to values when $fieldExtract is TRUE.
	 */
	public function returnArrays()
	{
		if(is_array($this->preamble))
		{
			foreach($this->preamble as $value)
			{
				// Everything after the first opening delimiter, minus the closing one.
				if(!preg_match('/^[^{(]*[{(](.*)$/s', $value, $matches))
					continue;
				$preamble = (string) substr($matches[1], 0, -1);
				$preambles['bibtexPreamble'] = trim($this->removeDelimitersAndExpand(trim($preamble), TRUE));
			}
		}
		if(isset($preambles))
			$this->preamble = $preambles;
		if($this->fieldExtract && is_array($this->strings))
		{
			$strings = array();
			foreach($this->strings as $value)
			{
				$matches = preg_split("/@string\s*([{(])/i", trim($value), 2, PREG_SPLIT_DELIM_CAPTURE);
				if(count($matches) < 3)
					continue;
				$pair = preg_split("/=/", $matches[2], 2);
				if(count($pair) < 2)
					continue;
				$strings[trim($pair[0])] = trim($this->extractStringValue($pair[1]));
			}
			$this->strings = $strings;
		}
		// removeDelimit and expandMacro have no effect unless entries were split into fields.
		if(($this->removeDelimit || $this->expandMacro) && $this->fieldExtract)
		{
			foreach($this->entries as $index => $fields)
			{
				if(!is_array($fields))
					continue;
				foreach($fields as $key => $value)
				{
					// Citation key and entry type are never delimited or expanded.
					if($key == 'bibtexCitation' || $key == 'bibtexEntryType')
						continue;
					$this->entries[$index][$key] = trim($this->removeDelimitersAndExpand($value));
				}
			}
		}
		if(empty($this->preamble))
			$this->preamble = FALSE;
		if(empty($this->strings))
			$this->strings = FALSE;
		if(empty($this->entries))
			$this->entries = FALSE;
		return array($this->preamble, $this->strings, $this->entries);
	}

	/**
	 * Tell whether $string starts and ends with a matching {...} or "..." pair
	 * that encloses all of it ('{a} # {b}' and '"a" # "b"' do not qualify).
	 * A character preceded by a backslash is never treated as a delimiter.
	 */
	private function isFullyDelimited($string)
	{
		$length = strlen($string);
		if($length < 2)
			return FALSE;
		$open = $string[0];
		$close = $string[$length - 1];
		if(!(($open == '{' && $close == '}') || ($open == '"' && $close == '"')))
			return FALSE;
		$depth = 0;
		for($i = 1; $i < $length - 1; $i++)
		{
			$char = $string[$i];
			if($char == '\\')
				$i++;
			else if($char == '{')
				$depth++;
			else if($char == '}')
			{
				$depth--;
				if($depth < 0)
				{
					// The opening brace was closed before the end of the string.
					if($open == '{')
						return FALSE;
					$depth = 0;
				}
			}
			else if($char == '"' && $open == '"' && $depth == 0)
				return FALSE; // the opening quote was closed before the end of the string
		}
		return TRUE;
	}

	/**
	 * Build the lookup table used for macro expansion: lower-cased name => value.
	 * Macros defined by @string take precedence over caller supplied ones.
	 */
	private function macroTable($preamble)
	{
		$table = array();
		$sources = array($this->userStrings);
		if(!$preamble)
			$sources[] = $this->strings;
		foreach($sources as $source)
		{
			if(!is_array($source))
				continue;
			foreach($source as $name => $value)
			{
				// Raw, not yet named, @string entries have integer keys: skip them.
				if(!is_string($name) || $name === '')
					continue;
				$table[strtolower($name)] = $value;
			}
		}
		return $table;
	}
}
459?>
460