1<?php
2/**
3 * DokuWiki Plugin bibtex4dw (BibTeX Parser Component)
4 *
5 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
6 * @author  Till Biskup <till@till-biskup.de>
7 * @version 0.2
8 * @date    2023-05-28
9 */
10
11/**
12 * This class is based originally on the PHP PEAR package
13 * Structures_BibTeX, (c) 1997-2005 The PHP Group, Elmar Pitschke
14 * For more information about the original PEAR package, please visit
15 * http://pear.php.net/package/Structures_BibTex
16 *
17 * Some additional modifications to the original PHP PEAR package have
18 * been made by Raphael Reitzig in 2010 for his bib2tpl program.
19 * For more information about the bib2tpl program, please visit
20 * http://lmazy.verrech.net/bib2tpl/
21 *
 * During transition from the original PHP PEAR package to this class forming
 * part of the Dokuwiki Plugin bibtex, several unnecessary functions, such as the
 * output to HTML and RTF, have been removed, as well as the dependency on PEAR.
 *
 * Other functions, such as the handling of BibTeX's @STRING patterns and basic
 * parsing of LaTeX code common in BibTeX entries (e.g. \emph{}), have been added.
28 *
29 * This class is no longer PHP 4 compatible, as was the original PEAR package.
30 */
31
32class bibtexparser_plugin_bibtex4dw
33{
34    /**
35     * Handle to SQLite db
36     */
37    public static $sqlite = array();
38    /**
39     * Array with the BibTex Data
40     *
41     * @access public
42     * @var array
43     */
44    public $data = array();
45    /**
46     * String with the BibTex content
47     *
48     * @access public
49     * @var string
50     */
51    public $content;
52    /**
53     * Array with the BibTex Strings
54     *
55     * @access private
56     * @var array
57     */
58    private $_strings = array();
59    /**
60     * Array with the BibTex entries
61     *
62     * @access public
63     * @var array
64     */
65    public $entries = array();
66    /**
67     * Array with possible Delimiters for the entries
68     *
69     * @access private
70     * @var array
71     */
72    private $_delimiters;
73    /**
74     * Array with replacements for LaTeX commands in fields of entries
75     *
76     * The patterns are searched for only in LaTeX math mode ($...$)
77     *
78     * As the output is in HTML, the best is to use the named representatives
79     * of the respective signs.
80     *
81     * @access private
82     * @var array
83     */
84    private $_latexMathmodeReplacements = array(
85        '\to' => '&rarr;',
86        '\bullet' => '&bull;',
87        '\circ' => '&deg;',
88        '\varepsilon' => '&epsilon;',
89        '\vartheta' => '&thetasym;',
90        '\varpi' => '&piv;',
91        '\varrho' => '&rho;',
92        '\varsigma' => '&sigmaf;',
93        '\varphi' => '&phi;',
94        '\cdot' => '&middot;',
95        '\cdots' => '&middot;&middot;&middot;',
96        '\rm ' => ''
97    );
98    /**
99     * Array with Greek letters to replace the LaTeX commands in fields of entries
100     *
101     * The greek letters are searched for only in LaTeX math mode ($...$)
102     *
103     * They will be checked both for lower and upper letters, as these differ only
104     * in the first character of their respective name.
105     *
106     * Note: The LaTeX mathmode replacements (see above) will be done first, thus
107     *       it is possible to use that to deal with special greek characters as
108     *       \varepsilon.
109     *
110     * @access private
111     * @var array
112     */
113    private $_greekLetters = array(
114        'alpha','beta','gamma','delta','epsilon',
115        'zeta','eta','theta','iota','kappa',
116        'lambda','mu','nu','xi','omicron',
117        'pi','rho','sigma','tau','upsilon',
118        'phi','chi','psi','omega',
119    );
120    /**
121     * Array to store warnings
122     *
123     * @access public
124     * @var array
125     */
126    public $warnings = array();
127    /**
128     * Run-time configuration options
129     *
130     * @access private
131     * @var array
132     */
133    private $_options;
134    /**
135     * Array with the "allowed" entry types
136     *
137     * @access public
138     * @var array
139     */
140    public $allowedEntryTypes;
141    /**
142     * Author Format Strings
143     *
144     * @access public
145     * @var string
146     */
147    public $authorstring;
148
149    /**
150     * List of SQL statements to be inserted at once
151     *
152     * @access private
153     * @var array
154     */
155    private $_sqlStatements = array();
156
157    /**
158     * Constructor
159     *
160     * @access public
161     * @return void
162     */
163    function __construct($options = array())
164    {
165        $this->_delimiters     = array('"'=>'"',
166                                        '{'=>'}');
167        $this->data            = array();
168        $this->content         = '';
169        //$this->_stripDelimiter = $stripDel;
170        //$this->_validate       = $val;
171        $this->warnings        = array();
172        $this->_options        = array(
173            'replaceLatex'      => true,
174            'stripDelimiter'    => true,
175            'validate'          => true,
176            'unwrap'            => false,
177            'wordWrapWidth'     => false,
178            'wordWrapBreak'     => "\n",
179            'wordWrapCut'       => 0,
180            'removeCurlyBraces' => true,
181            'extractAuthors'    => true,
182        );
183        foreach ($options as $option => $value) {
184            $test = $this->setOption($option, $value);
185        }
186        $this->allowedEntryTypes = array(
187            'article',
188            'book',
189            'booklet',
190            'conference',
191            'inbook',
192            'incollection',
193            'inproceedings',
194            'manual',
195            'mastersthesis',
196            'misc',
197            'phdthesis',
198            'proceedings',
199            'techreport',
200            'unpublished'
201        );
202        $this->authorstring = 'VON LAST, JR, FIRST';
203        $this->authordelimiter = '; ';
204    }
205
206    /**
207     * Sets run-time configuration options
208     *
209     * @access public
210     * @param string $option option name
211     * @param mixed  $value value for the option
212     * @return mixed true on success (DW msg on failure)
213     */
214    public function setOption($option, $value)
215    {
216        $ret = true;
217        if (array_key_exists($option, $this->_options)) {
218            $this->_options[$option] = $value;
219        } else {
220            msg("Unknown option $option", 2);
221            $ret = false;
222        }
223        return $ret;
224    }
225
226    /**
227     * Reads a given BibTex File
228     *
229     * @access public
230     * @param string $filename Name of the file
231     * @return mixed true on success (DW msg on failure)
232     */
233    public function loadFile($filename)
234    {
235        if (file_exists($filename)) {
236            if (($this->content = @file_get_contents($filename)) === false) {
237                msg("Could not open file $filename", 2);
238            } else {
239                $this->_pos    = 0;
240                $this->_oldpos = 0;
241                return true;
242            }
243        } else {
244            msg("Could not find file $filename", 2);
245        }
246    }
247
248    /**
249     * Reads bibtex from a string variable
250     *
251     * @access public
252     * @param string $bib String containing bibtex
253     * @return boolean true
254     */
255    public function loadString($bib)
256    {
257        $this->content = $bib;
258        $this->_pos    = 0;
259        $this->_oldpos = 0;
260        return true; // For compatibility with loadFile
261    }
262
263    /**
264     * Parse bibliography stored in content and clear the content if the parsing is successful.
265     *
266     * @access public
267     * @return boolean true on success and PEAR_Error if there was a problem
268     */
269    public function parseBibliography($sqlite = false)
270    {
271        //The amount of opening braces is compared to the amount of closing braces
272        //Braces inside comments are ignored
273        $this->warnings = array();
274        $this->data     = array();
275        $valid          = true;
276        $open           = 0;
277        $entry          = false;
278        $char           = '';
279        $lastchar       = '';
280        $buffer         = '';
281        $inField        = false;
282        $openInField    = 0;
283        $lastNonWsChar  = '';
284        for ($i = 0; $i < strlen($this->content); $i++) {
285            $char = substr($this->content, $i, 1);
286            if ((0 != $open) && ('@' == $char) && (!$inField)) {
287                if (!$this->_checkAt($buffer)) {
288                    $this->_generateWarning('WARNING_MISSING_END_BRACE', '', $buffer);
289                    //To correct the data we need to insert a closing brace
290                    $char     = '}';
291                    $i--;
292                }
293            }
294            if ((0 == $open) && ('@' == $char)) { //The beginning of an entry
295                $entry = true;
296            } elseif ($entry && ('{' == $char) && ('\\' != $lastchar)) { //Inside an entry and non quoted brace is opening
297                $open++;
298                if (!$inField && ($lastNonWsChar == '=')) {
299                    $inField = true;
300                } elseif ($inField) {
301                    $openInField++;
302                }
303            } elseif ($entry && ('}' == $char) && ('\\' != $lastchar)) { //Inside an entry and non quoted brace is closing
304                $open--;
305                if ($inField) {
306                    $openInField--;
307                    if ($openInField == 0) {
308                        $inField = false;
309                    }
310                }
311                if ($open < 0) { //More are closed than opened
312                    $valid = false;
313                }
314                if (0 == $open) { //End of entry
315                    $entry = false;
316                    // TODO: Some check for duplicate keys and issuing a warning if so?
317                    if ($sqlite) {
318                        $this->_prepareSqlStatement($buffer);
319                    } else {
320                        $this->_storeEntryInClass($buffer);
321                    }
322                    $buffer = '';
323                }
324            }
325            if ($entry) { //Inside entry
326                $buffer .= $char;
327            }
328            $lastchar = $char;
329            if ($char != ' ' && $char != '\t' && $char != '\n' && $char != '\r') {
330                $lastNonWsChar = $char;
331            }
332        }
333        //If open is one it may be possible that the last ending brace is missing
334        // TODO: Handle situation with using SQLite DB
335        if (1 == $open) {
336            $entrydata = $this->_parseEntry($buffer);
337            if (!$entrydata) {
338                $valid = false;
339            } else {
340                $this->data[] = $entrydata;
341                $buffer = '';
342                $open   = 0;
343            }
344        }
345        if ($sqlite) {
346            $this->_executeSqlStatements();
347        }
348        //At this point the open should be zero
349        if (0 != $open) {
350            $valid = false;
351        }
352        //Are there multiple entries with the same cite?
353        // TODO: Meanwhile, as in both cases (SQLite and manual) bibtex keys are used as index,
354        //       this situation shall no longer exist. Checking for duplicate keys needs be done above.
355        if ($this->_options['validate']) {
356            $cites = array();
357            foreach ($this->data as $entry) {
358                $cites[] = $entry['cite'];
359            }
360            $unique = array_unique($cites);
361            if (sizeof($cites) != sizeof($unique)) { //Some values have not been unique!
362                $notuniques = array();
363                for ($i = 0; $i < sizeof($cites); $i++) {
364                    if ('' == $unique[$i]) {
365                        $notuniques[] = $cites[$i];
366                    }
367                }
368                $this->_generateWarning('WARNING_MULTIPLE_ENTRIES', implode(',',$notuniques));
369            }
370        }
371        if ($valid) {
372            $this->content = '';
373            return true;
374        } else {
375            return false;
376        }
377    }
378
379    /**
380     * Parses what is stored in content and clears the content if the parsing is successful.
381     *
382     * @access public
383     * @return boolean true on success and PEAR_Error if there was a problem
384     */
385    public function parse($sqlite = false)
386    {
387        //The amount of opening braces is compared to the amount of closing braces
388        //Braces inside comments are ignored
389        $this->warnings = array();
390        $this->data     = array();
391        $valid          = true;
392        $open           = 0;
393        $entry          = false;
394        $char           = '';
395        $lastchar       = '';
396        $buffer         = '';
397        for ($i = 0; $i < strlen($this->content); $i++) {
398            $char = substr($this->content, $i, 1);
399            if ((0 != $open) && ('@' == $char)) {
400                if (!$this->_checkAt($buffer)) {
401                    $this->_generateWarning('WARNING_MISSING_END_BRACE', '', $buffer);
402                    //To correct the data we need to insert a closing brace
403                    $char     = '}';
404                    $i--;
405                }
406            }
407            if ((0 == $open) && ('@' == $char)) { //The beginning of an entry
408                $entry = true;
409            } elseif ($entry && ('{' == $char) && ('\\' != $lastchar)) { //Inside an entry and non quoted brace is opening
410                $open++;
411            } elseif ($entry && ('}' == $char) && ('\\' != $lastchar)) { //Inside an entry and non quoted brace is closing
412                $open--;
413                if ($open < 0) { //More are closed than opened
414                    $valid = false;
415                }
416                if (0 == $open) { //End of entry
417                    $entry     = false;
418                    if ($sqlite) {
419                        $this->_addEntryToSQLiteDB($buffer);
420                    } else {
421                        $entrydata = $this->_parseEntry($buffer);
422                        if ($entrydata) {
423                            $this->data[] = $entrydata;
424                        }
425                    }
426                    $buffer = '';
427                }
428            }
429            if ($entry) { //Inside entry
430                $buffer .= $char;
431            }
432            $lastchar = $char;
433        }
434        //If open is one it may be possible that the last ending brace is missing
435        // TODO: Handle situation with using SQLite DB
436        if (1 == $open) {
437            $entrydata = $this->_parseEntry($buffer);
438            if (!$entrydata) {
439                $valid = false;
440            } else {
441                $this->data[] = $entrydata;
442                $buffer = '';
443                $open   = 0;
444            }
445        }
446        //At this point the open should be zero
447        if (0 != $open) {
448            $valid = false;
449        }
450        //Are there multiple entries with the same cite?
451        if ($this->_options['validate']) {
452            $cites = array();
453            foreach ($this->data as $entry) {
454                $cites[] = $entry['cite'];
455            }
456            $unique = array_unique($cites);
457            if (sizeof($cites) != sizeof($unique)) { //Some values have not been unique!
458                $notuniques = array();
459                for ($i = 0; $i < sizeof($cites); $i++) {
460                    if ('' == $unique[$i]) {
461                        $notuniques[] = $cites[$i];
462                    }
463                }
464                $this->_generateWarning('WARNING_MULTIPLE_ENTRIES', implode(',',$notuniques));
465            }
466        }
467        if ($valid) {
468            $this->content = '';
469            return true;
470        } else {
471            return false;
472        }
473    }
474
475    /**
476     * Split entry in key and actual contents, call stringCallback for @string entries and bibItemCallback for all other entries.
477     *
478     * @param string $entry BibTeX entry, starting with @ and ending BEFORE the closing brace of the entry
479     * @param callable $stringCallback Will be called with two arguments (key, value) for @string entries
480     * @param callable $bibItemCallback Will be called with two arguments (key, full entry as string) for all non-@string entries
481     */
482    private function _storeBibTeXEntry($entry, $stringCallback, $bibItemCallback)
483    {
484        if ('@string' ==  strtolower(substr($entry, 0, 7))) {
485            $matches = array();
486            preg_match('/^@\w+\{(.+)/', $entry, $matches);
487            if (count($matches) > 0) {
488                $m = explode('=', $matches[1], 2);
489                $string = trim($m[0]);
490                $entry = substr(trim($m[1]), 1, -1);
491                call_user_func($stringCallback, $string, $entry);
492                return;
493            }
494        } else {
495            $entry = $entry.'}';
496            // Look for key
497            $matches = array();
498            preg_match('/^@(\w+)\{(.+),/', $entry, $matches);
499            if (count($matches) > 0) {
500                $entryType = $matches[1];
501                $key = $matches[2];
502                call_user_func($bibItemCallback, $key, $entry);
503                return;
504            }
505        }
506        throw new InvalidArgumentException('Could not parse entry "'.$entry.'"');
507    }
508
509    /**
510     * Store given entry in this object's members
511     *
512     * @param string $entry BibTeX entry, starting with @ and ending BEFORE the closing brace of the entry
513     */
514    private function _storeEntryInClass($entry)
515    {
516        $stringCallback = fn($key, $value) => $this->_strings[$key] = $value;
517        $bibItemCallback = fn($key, $value) => $this->entries[$key] = $value;
518        $this->_storeBibTeXEntry($entry,  $stringCallback, $bibItemCallback);
519    }
520
521    /**
522     * Add/update entry in SQLite DB (immediately)
523     */
524    private function _addEntryToSQLiteDB($entry)
525    {
526        $stringCallback = fn($key, $value) => $this->sqlite->query("INSERT OR REPLACE INTO strings (string, entry) VALUES (?,?)", $key, $value);
527        $bibItemCallback = fn($key, $value) => $this->sqlite->query("INSERT OR REPLACE INTO bibtex (key, entry) VALUES (?,?)", $key, $value);
528        $this->_storeBibTeXEntry($entry,  $stringCallback, $bibItemCallback);
529    }
530
531    /**
532     * Prepare an SQL statement to insert/update $entry in the DB.
533     */
534    private function _prepareSqlStatement($entry)
535    {
536        $stringCallback = fn($key, $value) => $this->_sqlStatements[] = array("INSERT OR REPLACE INTO strings (string, entry) VALUES (?,?)", array($key, $value));
537        $bibItemCallback = fn($key, $value) => $this->_sqlStatements[] = array("INSERT OR REPLACE INTO bibtex (key, entry) VALUES (?,?)", array($key, $value));
538        $this->_storeBibTeXEntry($entry,  $stringCallback, $bibItemCallback);
539    }
540
541    /**
542     * Execute all statements in $this->_sqlStatments in a single transaction.
543     *
544     * A single transaction is MUCH faster than executing statements sequentially.
545     */
546    private function _executeSqlStatements()
547    {
548        $pdo = $this->sqlite->getAdapter()->getPdo();
549        try {
550            if(!$pdo->beginTransaction()) {
551                msg('Sqlite error when starting transaction.', -1);
552                return;
553            }
554            foreach ($this->_sqlStatements as $statement) {
555                list($sql, $params) = $statement;
556                $pdo_stmt = $pdo->prepare($sql);
557                $pdo_stmt->execute($params);
558            }
559            if(!$pdo->commit()) {
560                msg('Sqlite error during commit.', -1);
561                return;
562            }
563        } catch (PDOException $ex) {
564            $pdo->rollBack();
565            throw $ex; // TODO handle this case, e.g., by falling back to single queries?
566        }
567        $this->_sqlStatements = array();
568    }
569
570    /**
571     * Extracting the data of one bibtex entry
572     *
573     * The parse function splits the content into its entries.
574     * Then every entry is parsed by this function.
575     * It parses the entry backwards.
576     * First the last '=' is searched and the value extracted from that.
577     * A copy is made of the entry if warnings should be generated. This takes quite
578     * some memory but it is needed to get good warnings. If no warnings are generated
579     * then you don't have to worry about memory.
580     * Then the last ',' is searched and the field extracted from that.
581     * Again the entry is shortened.
582     * Finally after all field=>value pairs the cite and type is extraced and the
583     * authors are splitted.
584     * If there is a problem false is returned.
585     *
586     * @access private
587     * @param string $entry The entry
588     * @return array The representation of the entry or false if there is a problem
589     */
590    private function _parseEntry($entry)
591    {
592        $entrycopy = '';
593        if ($this->_options['validate']) {
594            $entrycopy = $entry; //We need a copy for printing the warnings
595        }
596        $ret = array('bibtex' => $entry.'}');
597        if ('@string' ==  strtolower(substr($entry, 0, 7))) {
598            $matches = array();
599            preg_match('/^@\w+\{(.+)/' ,$entry, $matches);
600            if ( count($matches) > 0 )
601            {
602                $m = explode('=',$matches[1],2);
603                $this->_strings[trim($m[0])] = substr(trim($m[1]),1,-1);
604            }
605        } elseif ('@preamble' ==  strtolower(substr($entry, 0, 9))) {
606            //Preamble not yet supported!
607            if ($this->_options['validate']) {
608                $this->_generateWarning('PREAMBLE_ENTRY_NOT_YET_SUPPORTED', '', $entry.'}');
609            }
610        } else {
611            // Look for key
612            $matches = array();
613            preg_match('/^@\w+\{([\w\d]+),/' ,$entry, $matches);
614            if ( count($matches) > 0 )
615            {
616              $ret['entrykey'] = $matches[1];
617            }
618
619            //Parsing all fields
620            while (strrpos($entry,'=') !== false) {
621                $position = strrpos($entry, '=');
622                //Checking that the equal sign is not quoted or is not inside a equation (For example in an abstract)
623                $proceed  = true;
624                if (substr($entry, $position-1, 1) == '\\') {
625                    $proceed = false;
626                }
627                if ($proceed) {
628                    $proceed = $this->_checkEqualSign($entry, $position);
629                }
630                while (!$proceed) {
631                    $substring = substr($entry, 0, $position);
632                    $position  = strrpos($substring,'=');
633                    $proceed   = true;
634                    if (substr($entry, $position-1, 1) == '\\') {
635                        $proceed = false;
636                    }
637                    if ($proceed) {
638                        $proceed = $this->_checkEqualSign($entry, $position);
639                    }
640                }
641
642                $value = trim(substr($entry, $position+1));
643                $entry = substr($entry, 0, $position);
644
645                if (',' == substr($value, strlen($value)-1, 1)) {
646                    $value = substr($value, 0, -1);
647                }
648                if ($this->_options['validate']) {
649                    $this->_validateValue($value, $entrycopy);
650                }
651
652                // Handle string replacements
653                // IMPORTANT: Must precede stripDelimiter call
654                if (!in_array(substr($value,0,1),array_keys($this->_delimiters))) {
655                      if (!empty($this->sqlite)) {
656                        $stringReplacement = $this->sqlite->res2arr($this->sqlite->query("SELECT entry FROM strings WHERE string = ?",$value));
657                        if (!empty($stringReplacement)) {
658                            $value = $stringReplacement[0]['entry'];
659                        }
660                    } elseif (array_key_exists($value,$this->_strings)) {
661                        $value = $this->_strings[$value];
662                    }
663                }
664
665                if ($this->_options['replaceLatex']) {
666                    $value = $this->_replaceLatex($value);
667                }
668
669                if ($this->_options['stripDelimiter']) {
670                    $value = $this->_stripDelimiter($value);
671                }
672                if ($this->_options['unwrap']) {
673                    $value = $this->_unwrap($value);
674                }
675                if ($this->_options['removeCurlyBraces']) {
676                    $value = $this->_removeCurlyBraces($value);
677                }
678
679                $position    = strrpos($entry, ',');
680                $field       = strtolower(trim(substr($entry, $position+1)));
681                $ret[$field] = $value;
682                $entry       = substr($entry, 0, $position);
683            }
684            //Parsing cite and entry type
685            $arr = explode('{', $entry);
686            $ret['cite'] = trim($arr[1]);
687            $ret['entrytype'] = strtolower(trim($arr[0]));
688            if ('@' == $ret['entrytype'][0]) {
689                $ret['entrytype'] = substr($ret['entrytype'], 1);
690            }
691            if ($this->_options['validate']) {
692                if (!$this->_checkAllowedEntryType($ret['entrytype'])) {
693                    $this->_generateWarning('WARNING_NOT_ALLOWED_ENTRY_TYPE', $ret['entrytype'], $entry.'}');
694                }
695            }
696            //Handling the authors
697            if (in_array('author', array_keys($ret)) && $this->_options['extractAuthors']) {
698                // Array with all the authors in $ret['authors']
699                $ret['authors'] = $this->_extractAuthors($ret['author']);
700                // AuthorYear for sorting purposes in $ref['authoryear']
701                if (empty($ret['year'])) {
702                    if (!empty($ret['date']) && preg_match('|(\d\d\d\d).*|U', $ret['date'], $matches)) {
703                        $ret['year'] = $matches[1];
704                    } else {
705                        $ret['year'] = '[n.d.]';
706                    }
707                }
708                $ret['authoryear'] = $ret['authors'][0]['last'] . $ret['year'];
709                // Nicely formatted authors list in $ret['author']
710                $tmparray = array();
711                foreach ($ret['authors'] as $authorentry) {
712                    $tmparray[] = $this->_formatAuthor($authorentry);
713                }
714                $ret['author'] = implode($this->authordelimiter, $tmparray);
715            }
716            //Handling the editors
717            if (in_array('editor', array_keys($ret)) && $this->_options['extractAuthors']) {
718                // Array with all the editors in $ret['editors']
719                $ret['editors'] = $this->_extractAuthors($ret['editor']);
720                // Nicely formatted authors list in $ret['editor']
721                $tmparray = array();
722                foreach ($ret['editors'] as $editorentry) {
723                    $tmparray[] = $this->_formatAuthor($editorentry);
724                }
725                $ret['editor'] = implode($this->authordelimiter, $tmparray);
726            }
727        }
728        return $ret;
729    }
730
731    /**
732     * Parsing for a subset of LaTeX code that can be found more often in BibTeX entries
733     *
734     * TODO: Extend this as necessary
735     */
736    private function _replaceLatex($entry) {
737        // \emph{...} -> <em>...</em>
738        $entry = preg_replace('/\\\emph\{([^\}]+)\}/', '<em>$1</em>', $entry);
739        // \textbf{...} -> <strong>...</strong>
740        $entry = preg_replace('/\\\textbf\{([^\}]+)\}/', '<strong>$1</strong>', $entry);
741        // quotation marks
742        $entry = str_replace("``","&quot;",$entry);
743        $entry = str_replace("''","&quot;",$entry);
744        // \& -> &amp;
745        $entry = str_replace("\&","&amp;",$entry);
746        // \% -> %;
747        $entry = str_replace("\%","%;",$entry);
748        // "\ " -> " ";
749        $entry = str_replace("\ "," ",$entry);
750        // --- -> &mdash;
751        $entry = str_replace("---","&mdash;",$entry);
752        // -- -> -
753        $entry = str_replace("--","-",$entry);
754        // \url{...} -> ...
755        $entry = preg_replace("/\\\url\{([^\}]+)\}/",'<a href="\\1">\\1</a>',$entry);
756        // Handle umlauts
757        $entry = preg_replace('/\\\"\{([aeiouyAEIOU])\}/',"&\\1uml;",$entry);
758        $entry = preg_replace('/\\\"([aeiouyAEIOU])/',"&\\1uml;",$entry);
759        $entry = str_replace("\ss","&szlig;",$entry);
760        $entry = str_replace('"s',"&szlig;",$entry);
761        // Handle accents
762        // Handle acute
763        $entry = str_replace("\'c","&#x107;",$entry);
764        $entry = preg_replace("/\\\'(.?)/","&\\1acute;",$entry);
765        // Handle grave
766        $entry = preg_replace("/\\\`(.?)/","&\\1grave;",$entry);
767        // Handle circumflex
768        $entry = preg_replace("/\\\(\^)(.?)/","&\\2circ;",$entry);
769        // Handle hatschek
770        $entry = str_replace('\v{z}',"&#x17E;",$entry);
771        $entry = str_replace('\v{c}',"&#x10D;",$entry);
772        // Handle cedille
773        $entry = preg_replace("/\\\c\{(.?)\}/","\\1&#x0327;",$entry);
774        // Handle tilde
775        $entry = preg_replace("/\\\~(.?)/","&\\1tilde;",$entry);
776        // ae and oe ligatures
777        $entry = preg_replace('/\\\([aoAO]{1}[eE]{1})/',"&\\1lig;",$entry);
778        // Handle i without dot
779        $entry = str_replace("\i","&#305;",$entry);
780        // Handle u with bar
781        $entry = str_replace("\={u}","&#363;",$entry);
782        // Handle \l and \L
783        $entry = str_replace("\l","&#322;",$entry);
784        $entry = str_replace("\L","&#321;",$entry);
785
786        // \o and \O
787        $entry = preg_replace('/\\\([oO]{1})/',"&\\1slash;",$entry);
788        // \aa and \AA
789        $entry = preg_replace('/\\\([aA]{1})([aA]{1})/',"&\\1ring;",$entry);
790        // Replace remaining "~" with "&nbsp;"
791        $entry = str_replace("~","&nbsp;",$entry);
792        // Handle math ($...$)
793        preg_match('/\$([^$]+)\$/' ,$entry, $matches);
794        if ( count($matches) > 0 ) {
795            foreach ($matches as $match) {
796                // Fix superscript and subscript
797                $entry = preg_replace("/\^\{([^\}]+)\}/","<sup>\\1</sup>",$entry);
798                $entry = preg_replace("/_\{([^\}]+)\}/","<sub>\\1</sub>",$entry);
799                $entry = preg_replace("/\^([\\\]{1}\w+)/","<sup>\\1</sup>",$entry);
800                $entry = preg_replace("/_([\\\]{1}\w+)/","<sub>\\1</sub>",$entry);
801                $entry = preg_replace("/\^([^\\\]{1})/","<sup>\\1</sup>",$entry);
802                $entry = preg_replace("/_([^\\\]{1})/","<sub>\\1</sub>",$entry);
803                // Replace LaTeX math commands, e.g. "\to"
804                foreach ($this->_latexMathmodeReplacements as $orig => $repl) {
805                    $entry = str_replace($orig,$repl,$entry);
806                }
807                // Replace both lowercase and uppercase Greek letters
808                foreach ($this->_greekLetters as $letter) {
809                    $upLatex = '\\' . ucfirst($letter);
810                    $upHtml = "&" . ucfirst($letter) . ";";
811                    $loLatex = '\\' . $letter;
812                    $loHtml = "&" . $letter . ";";
813                    $entry = str_replace($upLatex,$upHtml,$entry);
814                    $entry = str_replace($loLatex,$loHtml,$entry);
815                }
816            }
817            // Finally, remove the LaTeX mathmode $ delimiters
818            $entry = str_replace("$","",$entry);
819        }
820        return $entry;
821    }
822
823    /**
824     * Checking whether the position of the '=' is correct
825     *
     * Sometimes a '=' appears inside a field value (for example in an abstract).
     * This method checks whether the '=' lies outside braces; if so, the '=' is a
     * genuine field separator and true is returned.
     * If the '=' lies inside braces, it belongs to an equation and false is returned.
829     *
830     * @access private
831     * @param string $entry The text of the whole remaining entry
     * @param int $position The position of the '=' that is to be checked
     * @return bool true if the '=' is correct, false if it belongs to an equation
834     */
835    private function _checkEqualSign($entry, $position)
836    {
837        $ret = true;
838        //This is getting tricky
839        //We check the string backwards until the position and count the closing an opening braces
840        //If we reach the position the amount of opening and closing braces should be equal
841        $length = strlen($entry);
842        $open   = 0;
843        for ($i = $length-1; $i >= $position; $i--) {
844            $precedingchar = substr($entry, $i-1, 1);
845            $char          = substr($entry, $i, 1);
846            if (('{' == $char) && ('\\' != $precedingchar)) {
847                $open++;
848            }
849            if (('}' == $char) && ('\\' != $precedingchar)) {
850                $open--;
851            }
852        }
853        if (0 != $open) {
854            $ret = false;
855        }
856        //There is still the posibility that the entry is delimited by double quotes.
857        //Then it is possible that the braces are equal even if the '=' is in an equation.
858        if ($ret) {
859            $entrycopy = trim($entry);
860            $lastchar  = $entrycopy[strlen($entrycopy)-1];
861            if (',' == $lastchar) {
862                $lastchar = $entrycopy[strlen($entrycopy)-2];
863            }
864            if ('"' == $lastchar) {
865                //The return value is set to false
866                //If we find the closing " before the '=' it is set to true again.
867                //Remember we begin to search the entry backwards so the " has to show up twice - ending and beginning delimiter
868                $ret = false;
869                $found = 0;
870                for ($i = $length; $i >= $position; $i--) {
871                    $precedingchar = substr($entry, $i-1, 1);
872                    $char          = substr($entry, $i, 1);
873                    if (('"' == $char) && ('\\' != $precedingchar)) {
874                        $found++;
875                    }
876                    if (2 == $found) {
877                        $ret = true;
878                        break;
879                    }
880                }
881            }
882        }
883        return $ret;
884    }
885
886    /**
887     * Checking if the entry type is allowed
888     *
889     * @access private
890     * @param string $entry The entry to check
891     * @return bool true if allowed, false otherwise
892     */
893    private function _checkAllowedEntryType($entry)
894    {
895        return in_array($entry, $this->allowedEntryTypes);
896    }
897
898    /**
899     * Checking whether an at is outside an entry
900     *
     * Sometimes an entry is missing its closing brace. Then the at of the next entry seems
     * to be inside the current entry. This is checked here. When it is most likely that the
     * at is the opening at of the next entry, this method returns false.
904     *
905     * @access private
906     * @param string $entry The text of the entry until the at
907     * @return bool true if the at is correct, false if the at is likely to begin the next entry.
908     */
909    private function _checkAt($entry)
910    {
911        $ret     = false;
912        $opening = array_keys($this->_delimiters);
913        $closing = array_values($this->_delimiters);
914        //Getting the value (at is only allowd in values)
915        if (strrpos($entry,'=') !== false) {
916            $position = strrpos($entry, '=');
917            $proceed  = true;
918            if (substr($entry, $position-1, 1) == '\\') {
919                $proceed = false;
920            }
921            while (!$proceed) {
922                $substring = substr($entry, 0, $position);
923                $position  = strrpos($substring,'=');
924                $proceed   = true;
925                if (substr($entry, $position-1, 1) == '\\') {
926                    $proceed = false;
927                }
928            }
929            $value    = trim(substr($entry, $position+1));
930            $open     = 0;
931            $char     = '';
932            $lastchar = '';
933            for ($i = 0; $i < strlen($value); $i++) {
934                $char = substr($this->content, $i, 1);
935                if (in_array($char, $opening) && ('\\' != $lastchar)) {
936                    $open++;
937                } elseif (in_array($char, $closing) && ('\\' != $lastchar)) {
938                    $open--;
939                }
940                $lastchar = $char;
941            }
942            //if open is grater zero were are inside an entry
943            if ($open>0) {
944                $ret = true;
945            }
946        }
947        return $ret;
948    }
949
950    /**
951     * Stripping Delimiter
952     *
953     * @access private
954     * @param string $entry The entry where the Delimiter should be stripped from
955     * @return string Stripped entry
956     */
957    private function _stripDelimiter($entry)
958    {
959        $beginningdels = array_keys($this->_delimiters);
960        $length        = strlen($entry);
961        $firstchar     = substr($entry, 0, 1);
962        $lastchar      = substr($entry, -1, 1);
963        while (in_array($firstchar, $beginningdels)) { //The first character is an opening delimiter
964            if ($lastchar == $this->_delimiters[$firstchar]) { //Matches to closing Delimiter
965                $entry = substr($entry, 1, -1);
966            } else {
967                break;
968            }
969            $firstchar = substr($entry, 0, 1);
970            $lastchar  = substr($entry, -1, 1);
971        }
972        return $entry;
973    }
974
975    /**
976     * Unwrapping entry
977     *
978     * @access private
979     * @param string $entry The entry to unwrap
980     * @return string unwrapped entry
981     */
982    private function _unwrap($entry)
983    {
984        $entry = preg_replace('/\s+/', ' ', $entry);
985        return trim($entry);
986    }
987
988    /**
989     * Wordwrap an entry
990     *
991     * @access private
992     * @param string $entry The entry to wrap
993     * @return string wrapped entry
994     */
995    private function _wordwrap($entry)
996    {
997        if ( (''!=$entry) && (is_string($entry)) ) {
998            $entry = wordwrap($entry, $this->_options['wordWrapWidth'], $this->_options['wordWrapBreak'], $this->_options['wordWrapCut']);
999        }
1000        return $entry;
1001    }
1002
1003    /**
1004     * Extracting the authors
1005     *
1006     * @access private
1007     * @param string $entry The entry with the authors
1008     * @return array the extracted authors
1009     */
1010    private function _extractAuthors($entry) {
1011        $entry       = $this->_unwrap($entry);
1012        // Replace AND with and in author list - added 2010-12-12, till@till-bisup.de
1013        $entry       = str_replace(' AND ',' and ',$entry);
1014        $authorarray = array();
1015        $authorarray = explode(' and ', $entry);
1016        for ($i = 0; $i < sizeof($authorarray); $i++) {
1017            $author = trim($authorarray[$i]);
1018            /*The first version of how an author could be written (First von Last)
1019             has no commas in it*/
1020            $first    = '';
1021            $von      = '';
1022            $last     = '';
1023            $jr       = '';
1024            if (strpos($author, ',') === false) {
1025                $tmparray = array();
1026                $tmparray = explode(' ', $author);
1027                $size     = sizeof($tmparray);
1028                if (1 == $size) { //There is only a last
1029                    $last = $tmparray[0];
1030                } elseif (2 == $size) { //There is a first and a last
1031                    $first = $tmparray[0];
1032                    $last  = $tmparray[1];
1033                } else {
1034                    $invon  = false;
1035                    $inlast = false;
1036                    for ($j=0; $j<($size-1); $j++) {
1037                        if ($inlast) {
1038                            $last .= ' '.$tmparray[$j];
1039                        } elseif ($invon) {
1040                            $case = $this->_determineCase($tmparray[$j]);
1041                            if ((0 == $case) || (-1 == $case)) { //Change from von to last
1042                                //You only change when there is no more lower case there
1043                                $islast = true;
1044                                for ($k=($j+1); $k<($size-1); $k++) {
1045                                    $futurecase = $this->_determineCase($tmparray[$k]);
1046                                    if ($case == PHP_INT_MAX) {
1047                                        // Error case. IGNORE?
1048                                    } elseif (0 == $futurecase) {
1049                                        $islast = false;
1050                                    }
1051                                }
1052                                if ($islast) {
1053                                    $inlast = true;
1054                                    if (-1 == $case) { //Caseless belongs to the last
1055                                        $last .= ' '.$tmparray[$j];
1056                                    } else {
1057                                        $von  .= ' '.$tmparray[$j];
1058                                    }
1059                                } else {
1060                                    $von    .= ' '.$tmparray[$j];
1061                                }
1062                            } else {
1063                                $von .= ' '.$tmparray[$j];
1064                            }
1065                        } else {
1066                            $case = $this->_determineCase($tmparray[$j]);
1067                            if (0 == $case) { //Change from first to von
1068                                $invon = true;
1069                                $von   .= ' '.$tmparray[$j];
1070                            } else {
1071                                $first .= ' '.$tmparray[$j];
1072                            }
1073                        }
1074                    }
1075                    //The last entry is always the last!
1076                    $last .= ' '.$tmparray[$size-1];
1077                }
1078            } else { //Version 2 and 3
1079                $tmparray     = array();
1080                $tmparray     = explode(',', $author);
1081                //The first entry must contain von and last
1082                $vonlastarray = array();
1083                $vonlastarray = explode(' ', $tmparray[0]);
1084                $size         = sizeof($vonlastarray);
1085                if (1==$size) { //Only one entry->got to be the last
1086                    $last = $vonlastarray[0];
1087                } else {
1088                    $inlast = false;
1089                    for ($j=0; $j<($size-1); $j++) {
1090                        if ($inlast) {
1091                            $last .= ' '.$vonlastarray[$j];
1092                        } else {
1093                            if (0 != ($this->_determineCase($vonlastarray[$j]))) { //Change from von to last
1094                                $islast = true;
1095                                for ($k=($j+1); $k<($size-1); $k++) {
1096                                    $this->_determineCase($vonlastarray[$k]);
1097                                    $case = $this->_determineCase($vonlastarray[$k]);
1098                                    if (0 == $case) {
1099                                        $islast = false;
1100                                    }
1101                                }
1102                                if ($islast) {
1103                                    $inlast = true;
1104                                    $last   .= ' '.$vonlastarray[$j];
1105                                } else {
1106                                    $von    .= ' '.$vonlastarray[$j];
1107                                }
1108                            } else {
1109                                $von    .= ' '.$vonlastarray[$j];
1110                            }
1111                        }
1112                    }
1113                    $last .= ' '.$vonlastarray[$size-1];
1114                }
1115                //Now we check if it is version three (three entries in the array (two commas)
1116                if (3==sizeof($tmparray)) {
1117                    $jr = $tmparray[1];
1118                }
1119                //Everything in the last entry is first
1120                $first = $tmparray[sizeof($tmparray)-1];
1121            }
1122            $authorarray[$i] = array('first'=>trim($first), 'von'=>trim($von), 'last'=>trim($last), 'jr'=>trim($jr));
1123        }
1124        return $authorarray;
1125    }
1126
1127    /**
1128     * Case Determination according to the needs of BibTex
1129     *
1130     * To parse the Author(s) correctly a determination is needed
1131     * to get the Case of a word. There are three possible values:
1132     * - Upper Case (return value 1)
1133     * - Lower Case (return value 0)
1134     * - Caseless   (return value -1)
1135     *
1136     * @access private
1137     * @param string $word
1138     * @return int The Case or PHP_INT_MAX if there was a problem
1139     */
1140    private function _determineCase($word) {
1141        $ret         = -1;
1142        $trimmedword = trim ($word);
1143        /*We need this variable. Without the next of would not work
1144         (trim changes the variable automatically to a string!)*/
1145        if (is_string($word) && (strlen($trimmedword) > 0)) {
1146            $i         = 0;
1147            $found     = false;
1148            $openbrace = 0;
1149            while (!$found && ($i <= strlen($word))) {
1150                $letter = substr($trimmedword, $i, 1);
1151                $ord    = ord($letter);
1152                if ($ord == 123) { //Open brace
1153                    $openbrace++;
1154                }
1155                if ($ord == 125) { //Closing brace
1156                    $openbrace--;
1157                }
1158                if (($ord>=65) && ($ord<=90) && (0==$openbrace)) { //The first character is uppercase
1159                    $ret   = 1;
1160                    $found = true;
1161                } elseif ( ($ord>=97) && ($ord<=122) && (0==$openbrace) ) { //The first character is lowercase
1162                    $ret   = 0;
1163                    $found = true;
1164                } else { //Not yet found
1165                    $i++;
1166                }
1167            }
1168        } else {
1169            $ret = PHP_INT_MAX;
1170//            $ret = PEAR::raiseError('Could not determine case on word: '.(string)$word);
1171        }
1172        return $ret;
1173    }
1174
1175    /**
1176     * Validation of a value
1177     *
1178     * There may be several problems with the value of a field.
1179     * These problems exist but do not break the parsing.
1180     * If a problem is detected a warning is appended to the array warnings.
1181     *
1182     * @access private
     * @param string $entry The entry (i.e. one line) which should be validated
1184     * @param string $wholeentry The whole BibTex Entry which the one line is part of
1185     * @return void
1186     */
1187    private function _validateValue($entry, $wholeentry)
1188    {
1189        //There is no @ allowed if the entry is enclosed by braces
1190        if (preg_match('/^{.*@.*}$/', $entry)) {
1191            $this->_generateWarning('WARNING_AT_IN_BRACES', $entry, $wholeentry);
1192        }
1193        //No escaped " allowed if the entry is enclosed by double quotes
1194        if (preg_match('/^\".*\\".*\"$/', $entry)) {
1195            $this->_generateWarning('WARNING_ESCAPED_DOUBLE_QUOTE_INSIDE_DOUBLE_QUOTES', $entry, $wholeentry);
1196        }
1197        //Amount of Braces is not correct
1198        $open     = 0;
1199        $lastchar = '';
1200        $char     = '';
1201        for ($i = 0; $i < strlen($entry); $i++) {
1202            $char = substr($entry, $i, 1);
1203            if (('{' == $char) && ('\\' != $lastchar)) {
1204                $open++;
1205            }
1206            if (('}' == $char) && ('\\' != $lastchar)) {
1207                $open--;
1208            }
1209            $lastchar = $char;
1210        }
1211        if (0 != $open) {
1212            $this->_generateWarning('WARNING_UNBALANCED_AMOUNT_OF_BRACES', $entry, $wholeentry);
1213        }
1214    }
1215
1216    /**
1217     * Remove curly braces from entry
1218     *
1219     * @access private
1220     * @param string $value The value in which curly braces to be removed
     * @return string Value with removed curly braces
1222     */
1223    private function _removeCurlyBraces($value)
1224    {
1225        //First we save the delimiters
1226        $beginningdels = array_keys($this->_delimiters);
1227        $firstchar     = substr($value, 0, 1);
1228        $lastchar      = substr($value, -1, 1);
1229        $begin         = '';
1230        $end           = '';
1231        while (in_array($firstchar, $beginningdels)) { //The first character is an opening delimiter
1232            if ($lastchar == $this->_delimiters[$firstchar]) { //Matches to closing Delimiter
1233                $begin .= $firstchar;
1234                $end   .= $lastchar;
1235                $value  = substr($value, 1, -1);
1236            } else {
1237                break;
1238            }
1239            $firstchar = substr($value, 0, 1);
1240            $lastchar  = substr($value, -1, 1);
1241        }
1242        //Now we get rid of the curly braces
1243        $value = preg_replace('/[\{\}]/', '', $value);
1244        //Reattach delimiters
1245        $value       = $begin.$value.$end;
1246        return $value;
1247    }
1248
1249    /**
1250     * Generates a warning
1251     *
1252     * @access private
1253     * @param string $type The type of the warning
1254     * @param string $entry The line of the entry where the warning occurred
1255     * @param string $wholeentry OPTIONAL The whole entry where the warning occurred
1256     */
1257    private function _generateWarning($type, $entry, $wholeentry='')
1258    {
1259        $warning['warning']    = $type;
1260        $warning['entry']      = $entry;
1261        $warning['wholeentry'] = $wholeentry;
1262        $this->warnings[]      = $warning;
1263    }
1264
1265    /**
     * Clears all warnings
1267     *
1268     * @access public
1269     */
1270    public function clearWarnings()
1271    {
1272        $this->warnings = array();
1273    }
1274
1275    /**
1276     * Is there a warning?
1277     *
1278     * @access public
1279     * @return true if there is, false otherwise
1280     */
1281    public function hasWarning()
1282    {
1283        if (sizeof($this->warnings)>0) return true;
1284        else return false;
1285    }
1286
1287    /**
1288     * Returns the author formatted
1289     *
     * The author is formatted according to the pattern set in authorstring
1291     *
1292     * @access private
1293     * @param array $array Author array
1294     * @return string the formatted author string
1295     */
1296    private function _formatAuthor($array)
1297    {
1298        if (!array_key_exists('von', $array)) {
1299            $array['von'] = '';
1300        } else {
1301            $array['von'] = trim($array['von']);
1302        }
1303        if (!array_key_exists('last', $array)) {
1304            $array['last'] = '';
1305        } else {
1306            $array['last'] = trim($array['last']);
1307        }
1308        if (!array_key_exists('jr', $array)) {
1309            $array['jr'] = '';
1310        } else {
1311            $array['jr'] = trim($array['jr']);
1312        }
1313        if (!array_key_exists('first', $array)) {
1314            $array['first'] = '';
1315        } else {
1316            $array['first'] = trim($array['first']);
1317        }
1318        $ret = $this->authorstring;
1319        $ret = str_replace("VON", $array['von'], $ret);
1320        $ret = str_replace("LAST", $array['last'], $ret);
1321        // Assuming that "jr" is always separated by a comma
1322        if (!empty($array['jr'])) {
1323          $ret = str_replace("JR", $array['jr'], $ret);
1324        } else {
1325          $ret = str_replace(", JR", '', $ret);
1326        }
1327        $ret = str_replace("FIRST", $array['first'], $ret);
1328        return trim($ret);
1329    }
1330
1331}
1332?>
1333