1// Copyright (C) 2006 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15
16/**
17 * @fileoverview
18 * some functions for browser-side pretty printing of code contained in html.
19 *
20 * The lexer should work on a number of languages including C and friends,
21 * Java, Python, Bash, SQL, HTML, XML, CSS, Javascript, and Makefiles.
22 * It works passably on Ruby, PHP and Awk and a decent subset of Perl, but,
23 * because of commenting conventions, doesn't work on Smalltalk, Lisp-like, or
24 * CAML-like languages.
25 *
26 * If there's a language not mentioned here, then I don't know it, and don't
27 * know whether it works.  If it has a C-like, Bash-like, or XML-like syntax
28 * then it should work passably.
29 *
30 * Usage:
31 * 1) include this source file in an html page via
32 * <script type="text/javascript" src="/path/to/prettify.js"></script>
33 * 2) define style rules.  See the example page for examples.
34 * 3) mark the <pre> and <code> tags in your source with class=prettyprint.
35 *    You can also use the (html deprecated) <xmp> tag, but the pretty printer
36 *    needs to do more substantial DOM manipulations to support that, so some
37 *    css styles may not be preserved.
38 * That's it.  I wanted to keep the API as simple as possible, so there's no
39 * need to specify which language the code is in.
40 *
41 * Change log:
42 * cbeust, 2006/08/22
43 *   Java annotations (start with "@") are now captured as literals ("lit")
44 */
45
46var PR_keywords = {};
47/** initialize the keyword list for our target languages. */
48(function () {
49  var CPP_KEYWORDS = "abstract bool break case catch char class const " +
50    "const_cast continue default delete deprecated dllexport dllimport do " +
51    "double dynamic_cast else enum explicit extern false float for friend " +
52    "goto if inline int long mutable naked namespace new noinline noreturn " +
53    "nothrow novtable operator private property protected public register " +
54    "reinterpret_cast return selectany short signed sizeof static " +
55    "static_cast struct switch template this thread throw true try typedef " +
56    "typeid typename union unsigned using declaration, directive uuid " +
57    "virtual void volatile while typeof";
58  var CSHARP_KEYWORDS = "as base by byte checked decimal delegate descending " +
59    "event finally fixed foreach from group implicit in interface internal " +
60    "into is lock null object out override orderby params readonly ref sbyte " +
61    "sealed stackalloc string select uint ulong unchecked unsafe ushort var";
62  var JAVA_KEYWORDS = "package synchronized boolean implements import throws " +
63    "instanceof transient extends final strictfp native super";
64  var JSCRIPT_KEYWORDS = "debugger export function with NaN Infinity";
65  var PERL_KEYWORDS = "require sub unless until use elsif BEGIN END";
66  var PYTHON_KEYWORDS = "and assert def del elif except exec global lambda " +
67    "not or pass print raise yield False True None";
68  var RUBY_KEYWORDS = "then end begin rescue ensure module when undef next " +
69    "redo retry alias defined";
70  var SH_KEYWORDS = "done fi";
71
72  var KEYWORDS = [CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS,
73                  JSCRIPT_KEYWORDS, PERL_KEYWORDS, PYTHON_KEYWORDS,
74                  RUBY_KEYWORDS, SH_KEYWORDS];
75  for (var k = 0; k < KEYWORDS.length; k++) {
76    var kw = KEYWORDS[k].split(' ');
77    for (var i = 0; i < kw.length; i++) {
78      if (kw[i]) { PR_keywords[kw[i]] = true; }
79    }
80  }
81}).call(this);
82
83// token style names.  correspond to css classes
84/** token style for a string literal */
85var PR_STRING = 'str';
86/** token style for a keyword */
87var PR_KEYWORD = 'kwd';
88/** token style for a comment */
89var PR_COMMENT = 'com';
90/** token style for a type */
91var PR_TYPE = 'typ';
92/** token style for a literal value.  e.g. 1, null, true. */
93var PR_LITERAL = 'lit';
94/** token style for a punctuation string. */
95var PR_PUNCTUATION = 'pun';
96/** token style for a plain string. */
97var PR_PLAIN = 'pln';
98/** token style for a regexp. */
99var PR_REGEX = 'reg';
100/** token style for a constants. */
101var PR_CONSTANT = 'const';
102
103
104/** token style for an sgml tag. */
105var PR_TAG = 'tag';
106/** token style for a markup declaration such as a DOCTYPE. */
107var PR_DECLARATION = 'dec';
108/** token style for embedded source. */
109var PR_SOURCE = 'src';
110/** token style for an sgml attribute name. */
111var PR_ATTRIB_NAME = 'atn';
112/** token style for an sgml attribute value. */
113var PR_ATTRIB_VALUE = 'atv';
114/** token style for an new line. */
115var PR_NL = 'nl';
116
117/** the number of characters between tab columns */
118var PR_TAB_WIDTH = 8;
119
120/** the position of the end of a token during.  A division of a string into
121  * n tokens can be represented as a series n - 1 token ends, as long as
122  * runs of whitespace warrant their own token.
123  * @private
124  */
125function PR_TokenEnd(end, style) {
126  if (undefined === style) { throw new Error('BAD'); }
127  if ('number' != typeof(end)) { throw new Error('BAD'); }
128  this.end = end;
129  this.style = style;
130}
131PR_TokenEnd.prototype.toString = function () {
132  return '[PR_TokenEnd ' + this.end +
133    (this.style ? ':' + this.style : '') + ']';
134};
135
136
137/** a chunk of text with a style.  These are used to represent both the output
138  * from the lexing functions as well as intermediate results.
139  * @constructor
140  * @param token the token text
141  * @param style one of the token styles defined in designdoc-template, or null
142  *   for a styleless token, such as an embedded html tag.
143  * @private
144  */
145function PR_Token(token, style) {
146  if (undefined === style) { throw new Error('BAD'); }
147  this.token = token;
148  this.style = style;
149}
150
151PR_Token.prototype.toString = function () {
152  return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
153};
154
155
156/** a helper class that decodes common html entities used to escape special
157  * characters in source code.
158  * @constructor
159  * @private
160  */
161function PR_DecodeHelper() {
162  this.next = 0;
163  this.ch = '\0';
164}
165
166var PR_NAMED_ENTITIES = {
167  'lt':   '<',
168  'gt':   '>',
169  'quot': '"',
170  'apos': "'",
171  'amp':  '&'   // reencoding requires that & always be decoded properly
172};
173
174PR_DecodeHelper.prototype.decode = function (s, i) {
175  var next = i + 1;
176  var ch = s.charAt(i);
177  if ('&' === ch) {
178    var semi = s.indexOf(';', next);
179    if (semi >= 0 && semi < next + 4) {
180      var entityName = s.substring(next, semi);
181      var decoded = null;
182      if (entityName.charAt(0) === '#') {  // check for numeric entity
183        var ch1 = entityName.charAt(1);
184        var charCode;
185        if (ch1 === 'x' || ch1 === 'X') {  // like &#xA0;
186          charCode = parseInt(entityName.substring(2), 16);
187        } else {  // like &#160;
188          charCode = parseInt(entityName.substring(1), 10);
189        }
190        if (!isNaN(charCode)) {
191          decoded = String.fromCharCode(charCode);
192        }
193      }
194      if (!decoded) {
195        decoded = PR_NAMED_ENTITIES[entityName.toLowerCase()];
196      }
197      if (decoded) {
198        ch = decoded;
199        next = semi + 1;
200      } else {  // skip over unrecognized entity
201        next = i + 1;
202        ch = '\0';
203      }
204    }
205  }
206  this.next = next;
207  this.ch = ch;
208  return this.ch;
209};
210
211
212// some string utilities
213function PR_isWordChar(ch) {
214  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
215}
216
217function PR_isIdentifierStart(ch) {
218  return PR_isWordChar(ch) || ch == '_' || ch == '$' || ch == '@';
219}
220
221function PR_isIdentifierPart(ch) {
222  return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
223}
224
225function PR_isSpaceChar(ch) {
226  return "\t \r\n".indexOf(ch) >= 0;
227}
228
229function PR_isDigitChar(ch) {
230  return ch >= '0' && ch <= '9';
231}
232
233function PR_trim(s) {
234  var i = 0, j = s.length - 1;
235  while (i <= j && PR_isSpaceChar(s.charAt(i))) { ++i; }
236  while (j > i && PR_isSpaceChar(s.charAt(j))) { --j; }
237  return s.substring(i, j + 1);
238}
239
240function PR_startsWith(s, prefix) {
241  return s.length >= prefix.length && prefix == s.substring(0, prefix.length);
242}
243
244function PR_endsWith(s, suffix) {
245  return s.length >= suffix.length &&
246         suffix == s.substring(s.length - suffix.length, s.length);
247}
248
249/** a set of tokens that can precede a regular expression literal in javascript.
250  * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full
251  * list, but I've removed ones that might be problematic when seen in languages
252  * that don't support regular expression literals.
253  *
254  * <p>Specifically, I've removed any keywords that can't precede a regexp
255  * literal in a syntactically legal javascript program, and I've removed the
256  * "in" keyword since it's not a keyword in many languages, and might be used
257  * as a count of inches.
258  * @private
259  */
260var REGEXP_PRECEDER_PATTERN = (function () {
261    var preceders = [
262        "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=",
263        "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=",
264        "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";",
265        "<", "<<", "<<=", "<=", "=", "==", "===", ">",
266        ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[",
267        "^", "^=", "^^", "^^=", "{", "|", "|=", "||",
268        "||=", "~", "break", "case", "continue", "delete",
269        "do", "else", "finally", "instanceof",
270        "return", "throw", "try", "typeof"
271        ];
272    var pattern = '(?:' +
273      '(?:(?:^|[^0-9\.])\\.{1,3})|' +  // a dot that's not part of a number
274      '(?:(?:^|[^\\+])\\+)|' +  // allow + but not ++
275      '(?:(?:^|[^\\-])-)'  // allow - but not --
276      ;
277    for (var i = 0; i < preceders.length; ++i) {
278      var preceder = preceders[i];
279      if (PR_isWordChar(preceder.charAt(0))) {
280        pattern += '|\\b' + preceder;
281      } else {
282        pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1');
283      }
284    }
285    pattern += ')\\s*$';  // matches at end
286    return new RegExp(pattern);
287    // CAVEAT: this does not properly handle the case where a regular expression
288    // immediately follows another since a regular expression may have flags
289    // for case-sensitivity and the like.  Having regexp tokens adjacent is not
290    // valid in any language I'm aware of, so I'm punting.
291    // TODO: maybe style special characters inside a regexp as punctuation.
292  })();
293
294/** true iff prefix matches the first prefix characters in chars[0:len].
295  * @private
296  */
297function PR_prefixMatch(chars, len, prefix) {
298  if (len < prefix.length) { return false; }
299  for (var i = 0, n = prefix.length; i < n; ++i) {
300    if (prefix.charAt(i) != chars[i]) { return false; }
301  }
302  return true;
303}
304
305/** like textToHtml but escapes double quotes to be attribute safe. */
306function PR_attribToHtml(str) {
307  return str.replace(/&/g, '&amp;')
308    .replace(/</g, '&lt;')
309    .replace(/>/g, '&gt;')
310    .replace(/\"/g, '&quot;')
311    .replace(/\xa0/, '&nbsp;');
312}
313
314/** escapest html special characters to html. */
315function PR_textToHtml(str) {
316  return str.replace(/&/g, '&amp;')
317    .replace(/</g, '&lt;')
318    .replace(/>/g, '&gt;')
319    .replace(/\xa0/g, '&nbsp;');
320}
321
322/** is the given node's innerHTML normally unescaped? */
323function PR_isRawContent(node) {
324  return 'XMP' == node.tagName;
325}
326
327var PR_innerHtmlWorks = null;
328function PR_getInnerHtml(node) {
329  // inner html is hopelessly broken in Safari 2.0.4 when the content is
330  // an html description of well formed XML and the containing tag is a PRE
331   // tag, so we detect that case and emulate innerHTML.
332  if (null == PR_innerHtmlWorks) {
333    var testNode = document.createElement('PRE');
334    testNode.appendChild(
335        document.createTextNode('<!DOCTYPE foo PUBLIC "foo bar">\n<foo />'));
336    PR_innerHtmlWorks = !/</.test(testNode.innerHTML);
337  }
338
339  if (PR_innerHtmlWorks) {
340    var content = node.innerHTML;
341    // XMP tags contain unescaped entities so require special handling.
342    if (PR_isRawContent(node)) {
343       content = PR_textToHtml(content);
344    }
345    return content;
346  }
347
348  var out = [];
349  for (var child = node.firstChild; child; child = child.nextSibling) {
350    PR_normalizedHtml(child, out);
351  }
352  return out.join('');
353}
354
355/**
356 * walks the DOM returning a properly escaped version of innerHTML.
357 */
358function PR_normalizedHtml(node, out) {
359  switch (node.nodeType) {
360    case 1:  // an element
361      var name = node.tagName.toLowerCase();
362      out.push('\074', name);
363      for (var i = 0; i < node.attributes.length; ++i) {
364        var attr = node.attributes[i];
365        if (!attr.specified) { continue; }
366        out.push(' ');
367        PR_normalizedHtml(attr, out);
368      }
369      out.push('>');
370      for (var child = node.firstChild; child; child = child.nextSibling) {
371        PR_normalizedHtml(child, out);
372      }
373      if (node.firstChild || !/^(?:br|link|img)$/.test(name)) {
374        out.push('<\/', name, '>');
375      }
376      break;
377    case 2: // an attribute
378      out.push(node.name.toLowerCase(), '="', PR_attribToHtml(node.value), '"');
379      break;
380    case 3: case 4: // text
381      out.push(PR_textToHtml(node.nodeValue));
382      break;
383  }
384}
385
386/** expand tabs to spaces
387  * @param {Array} chunks PR_Tokens possibly containing tabs
388  * @param {Number} tabWidth number of spaces between tab columns
389  * @return {Array} chunks with tabs replaced with spaces
390  */
391function PR_expandTabs(chunks, tabWidth) {
392  var SPACES = '                ';
393
394  var charInLine = 0;
395  var decodeHelper = new PR_DecodeHelper();
396
397  var chunksOut = []
398  for (var chunkIndex = 0; chunkIndex < chunks.length; ++chunkIndex) {
399    var chunk = chunks[chunkIndex];
400    if (chunk.style == null) {
401      chunksOut.push(chunk);
402      continue;
403    }
404
405    var s = chunk.token;
406    var pos = 0;  // index of last character output
407    var out = [];
408
409    // walk over each character looking for tabs and newlines.
410    // On tabs, expand them.  On newlines, reset charInLine.
411    // Otherwise increment charInLine
412    for (var charIndex = 0, n = s.length; charIndex < n;
413         charIndex = decodeHelper.next) {
414      decodeHelper.decode(s, charIndex);
415      var ch = decodeHelper.ch;
416
417      switch (ch) {
418        case '\t':
419          out.push(s.substring(pos, charIndex));
420          // calculate how much space we need in front of this part
421          // nSpaces is the amount of padding -- the number of spaces needed to
422          // move us to the next column, where columns occur at factors of
423          // tabWidth.
424          var nSpaces = tabWidth - (charInLine % tabWidth);
425          charInLine += nSpaces;
426          for (; nSpaces >= 0; nSpaces -= SPACES.length) {
427            out.push(SPACES.substring(0, nSpaces));
428          }
429          pos = decodeHelper.next;
430          break;
431        case '\n': case '\r':
432          charInLine = 0;
433          break;
434        default:
435          ++charInLine;
436      }
437    }
438    out.push(s.substring(pos));
439    chunksOut.push(new PR_Token(out.join(''), chunk.style));
440  }
441  return chunksOut
442}
443
444/** split markup into chunks of html tags (style null) and
445  * plain text (style {@link #PR_PLAIN}).
446  *
447  * @param {String} s html.
448  * @return {Array} of PR_Tokens of style PR_PLAIN, and null.
449  * @private
450  */
451function PR_chunkify(s) {
452  // The below pattern matches one of the following
453  // (1) /[^<]+/ : A run of characters other than '<'
454  // (2) /<\/?[a-zA-Z][^>]*>/ : A probably tag that should not be highlighted
455  // (3) /</ : A '<' that does not begin a larger chunk.  Treated as 1
456  var chunkPattern = /(?:[^<]+|<\/?[a-zA-Z][^>]*>|<)/g;
457  // since the pattern has the 'g' modifier and defines no capturing groups,
458  // this will return a list of all chunks which we then classify and wrap as
459  // PR_Tokens
460  var matches = s.match(chunkPattern);
461  var chunks = [];
462  if (matches) {
463    var lastChunk = null;
464    for (var i = 0, n = matches.length; i < n; ++i) {
465      var chunkText = matches[i];
466      var style;
467      if (chunkText.length < 2 || chunkText.charAt(0) !== '<') {
468        if (lastChunk && lastChunk.style === PR_PLAIN) {
469          lastChunk.token += chunkText;
470          continue;
471        }
472        style = PR_PLAIN;
473      } else {  // a tag
474        style = null;
475      }
476      lastChunk = new PR_Token(chunkText, style);
477      chunks.push(lastChunk);
478    }
479  }
480  return chunks;
481}
482
483/** walk the tokenEnds list and the chunk list in parallel to generate a list
484  * of split tokens.
485  * @private
486  */
487function PR_splitChunks(chunks, tokenEnds) {
488  var tokens = [];  // the output
489
490  var ci = 0;  // index into chunks
491  // position of beginning of amount written so far in absolute space.
492  var posAbs = 0;
493  // position of amount written so far in chunk space
494  var posChunk = 0;
495
496  // current chunk
497  var chunk = new PR_Token('', null);
498  for (var ei = 0, ne = tokenEnds.length, lastEnd = 0; ei < ne; ++ei) {
499    var tokenEnd = tokenEnds[ei];
500    var end = tokenEnd.end;
501    if (end === lastEnd) { continue; }  // skip empty regions
502
503    var tokLen = end - posAbs;
504    var remainingInChunk = chunk.token.length - posChunk;
505    while (remainingInChunk <= tokLen) {
506      if (remainingInChunk > 0) {
507        tokens.push(
508            new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
509                         null == chunk.style ? null : tokenEnd.style));
510      }
511      posAbs += remainingInChunk;
512      posChunk = 0;
513      if (ci < chunks.length) {
514        chunk = chunks[ci++];
515      }
516
517      tokLen = end - posAbs;
518      remainingInChunk = chunk.token.length - posChunk;
519    }
520
521    if (tokLen) {
522      tokens.push(
523          new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
524                       tokenEnd.style));
525      posAbs += tokLen;
526      posChunk += tokLen;
527    }
528  }
529
530  return tokens;
531}
532
533/** splits markup tokens into declarations, tags, and source chunks.
534  * @private
535  */
536function PR_splitMarkup(chunks) {
537  // A state machine to split out declarations, tags, etc.
538  // This state machine deals with absolute space in the text, indexed by k,
539  // and position in the current chunk, indexed by pos and tokenStart to
540  // generate a list of the ends of tokens.
541  // Absolute space is calculated by considering the chunks as appended into
542  // one big string, as they were before being split.
543
544  // Known failure cases
545  // Server side scripting sections such as <?...?> in attributes.
546  // i.e. <span class="<? foo ?>">
547  // Handling this would require a stack, and we don't use PHP.
548
549  // The output: a list of pairs of PR_TokenEnd instances
550  var tokenEnds = [];
551
552  var state = 0;  // FSM state variable
553  var k = 0;  // position in absolute space of the start of the current chunk
554  var tokenStart = -1;  // the start of the current token
555
556  // Try to find a closing tag for any open <style> or <script> tags
557  // We can't do this at a later stage because then the following case
558  // would fail:
559  // <script>document.writeln('<!--');</script>
560
561  // We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
562  // can check whether to enter into a no scripting section when the tag ends.
563  var tokenChars = new Array(12);
564  var tokenCharsI = 0;
565  // if non null, the tag prefix that we need to see to break out.
566  var endScriptTag = null;
567  var decodeHelper = new PR_DecodeHelper();
568
569  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
570    var chunk = chunks[ci];
571    if (PR_PLAIN != chunk.style) {
572      k += chunk.token.length;
573      continue;
574    }
575    var s = chunk.token;
576    var pos = 0;  // the position past the last character processed so far in s
577
578    for (var i = 0, n = s.length; i < n; /* i = next at bottom */) {
579      decodeHelper.decode(s, i);
580      var ch = decodeHelper.ch;
581      var next = decodeHelper.next;
582
583      var tokenStyle = null;
584      switch (state) {
585        case 0:
586          if ('<' == ch) { state = 1; }
587          if (ch.match(/[\r\n]/)) { state = 13; }
588          break;
589        case 1:
590          tokenCharsI = 0;
591          if ('/' == ch) {  // only consider close tags if we're in script/style
592            state = 7;
593          } else if (null == endScriptTag) {
594            if ('!' == ch) {
595              state = 2;
596            } else if (PR_isWordChar(ch)) {
597              state = 8;
598            } else if ('?' == ch) {
599              state = 9;
600            } else if ('%' == ch) {
601              state = 11;
602            } else if ('<' != ch) {
603              state = 0;
604            }
605          } else if ('<' != ch) {
606            state = 0;
607          }
608          break;
609        case 2:
610          if ('-' == ch) {
611            state = 4;
612          } else if (PR_isWordChar(ch)) {
613            state = 3;
614          } else if ('<' == ch) {
615            state = 1;
616          } else {
617            state = 0;
618          }
619          break;
620        case 3:
621          if ('>' == ch) {
622            state = 0;
623            tokenStyle = PR_DECLARATION;
624          }
625          break;
626        case 4:
627          if ('-' == ch) { state = 5; }
628          break;
629        case 5:
630          if ('-' == ch) { state = 6; }
631          break;
632        case 6:
633          if ('>' == ch) {
634            state = 0;
635            tokenStyle = PR_COMMENT;
636          } else if ('-' == ch) {
637            state = 6;
638          } else {
639            state = 4;
640          }
641          break;
642        case 7:
643          if (PR_isWordChar(ch)) {
644            state = 8;
645          } else if ('<' == ch) {
646            state = 1;
647          } else {
648            state = 0;
649          }
650          break;
651        case 8:
652          if ('>' == ch) {
653            state = 0;
654            tokenStyle = PR_TAG;
655          }
656          break;
657        case 9:
658          if ('?' == ch) { state = 10; }
659          break;
660        case 10:
661          if ('>' == ch) {
662            state = 0;
663            tokenStyle = PR_SOURCE;
664          } else if ('?' != ch) {
665            state = 9;
666          }
667          break;
668        case 11:
669          if ('%' == ch) { state = 12; }
670          break;
671        case 12:
672          if ('>' == ch) {
673            state = 0;
674            tokenStyle = PR_SOURCE;
675          } else if ('%' != ch) {
676            state = 11;
677          }
678          break;
679        case 13:
680          tokenCharsI = 0;
681          if (!ch.match(/[\r\n]/)) {
682              state = 0;
683              tokenStyle = PR_NL;
684              next = i;
685          }
686          break;
687      }
688
689      if (tokenCharsI < tokenChars.length) {
690        tokenChars[tokenCharsI++] = ch.toLowerCase();
691      }
692      if (1 == state || 13 == state) { tokenStart = k + i; }
693      i = next;
694      if (tokenStyle != null) {
695        if (null != tokenStyle) {
696          if (endScriptTag) {
697            if (PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag)) {
698              endScriptTag = null;
699            }
700          } else {
701            if (PR_prefixMatch(tokenChars, tokenCharsI, 'script')) {
702              endScriptTag = '/script';
703            } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'style')) {
704              endScriptTag = '/style';
705            } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'xmp')) {
706              endScriptTag = '/xmp';
707            }
708          }
709          // disallow the tag if endScriptTag is set and this was not an open
710          // tag.
711          if (endScriptTag && tokenCharsI && '/' == tokenChars[0]) {
712            tokenStyle = null;
713          }
714        }
715        if (null != tokenStyle) {
716//          PR_splitNonStringNonCommentToken(tokenStart,tokenEnds);
717//          alert(tokenStart+" "+tokenStyle);
718          tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
719//          alert(tokenEnds)
720//          alert(tokenStyle+"\n"+tokenStart+"\n"+tokenEnds)
721          tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
722        }
723      }
724    }
725    k += chunk.token.length;
726  }
727  tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));
728
729  return tokenEnds;
730}
731
732/** splits the given string into comment, string, and "other" tokens.
733  * @return {Array} of PR_Tokens with style in
734  *   (PR_STRING, PR_COMMENT, PR_PLAIN, null)
735  *   The result array may contain spurious zero length tokens.  Ignore them.
736  *
737  * @private
738  */
739function PR_splitStringAndCommentTokens(chunks) {
740  // a state machine to split out comments, strings, and other stuff
741  var tokenEnds = [];  // positions of ends of tokens in absolute space
742  var state = 0;  // FSM state variable
743  var delim = -1;  // string delimiter
744  var k = 0;  // absolute position of beginning of current chunk
745  var lookBehind = [];  // the last 16 characters processed collapsing space
746  var lastCh = '';
747
748  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
749    var chunk = chunks[ci];
750    var s = chunk.token;
751    if (PR_PLAIN == chunk.style) {
752      var decodeHelper = new PR_DecodeHelper();
753      var last = -1;
754      var next;
755      for (var i = 0, n = s.length; i < n; last = i, i = next) {
756        decodeHelper.decode(s, i);
757        var ch = decodeHelper.ch;
758        next = decodeHelper.next;
759        if (0 == state) {
760          if (ch == '"' || ch == '\'' || ch == '`') {
761            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
762            state = 1;
763            delim = ch;
764          } else if (ch == '/') {
765            state = 3;
766          } else if (ch == '#') {
767            state = 4;
768            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
769          }
770        } else if (1 == state) {
771          if (ch == delim) {
772            state = 0;
773            tokenEnds.push(new PR_TokenEnd(k + next, '/'==ch?PR_REGEX:PR_STRING));
774          } else if (ch == '\\') {
775            state = 2;
776          }
777        } else if (2 == state) {
778          state = 1;
779        } else if (3 == state) {
780          if (ch == '/') {
781            state = 4;
782            tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
783          } else if (ch == '*') {
784            state = 5;
785            tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
786          } else {
787            // check the last token and see if we should treat this as the start
788            // of a regular expression literal.
789            if ((!lookBehind.length ||
790                 REGEXP_PRECEDER_PATTERN.test(lookBehind.join('')))) {
791              // treat regular expression as a string with delimiter /
792              state = 1;
793              delim = '/';
794              tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
795            } else {
796              state = 0;
797              // next loop will reenter state 0 without same value of i, so
798              // ch will be reconsidered as start of new token.
799              next = i;
800              continue;
801            }
802          }
803        } else if (4 == state) {
804          if (ch == '\r' || ch == '\n') {
805            state = 0;
806            tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
807          }
808        } else if (5 == state) {
809          if (ch == '*') {
810            state = 6;
811          }
812        } else if (6 == state) {
813          if (ch == '/') {
814            state = 0;
815            tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT));
816            continue;  // skip lookbehind
817          } else if (ch != '*') {
818            state = 5;
819          }
820        }
821
822        // push char on lookbehind if it's not a comment token.  Don't
823        // waste space with lots of space ; just leave enough to indicate
824        // boundaries.
825        if (3 > state || state > 6) {
826          var isSpace = PR_isSpaceChar(ch);
827          if (!(lastCh === ' ' && isSpace)) {
828            if (lookBehind.length > 16) { lookBehind.shift(); }
829            lastCh = isSpace ? ' ' : ch;
830            lookBehind.push(lastCh);
831          }
832        }
833      }
834    }
835    k += s.length;
836  }
837  var endTokenType;
838  switch (state) {
839    case 1: case 2:
840      endTokenType = PR_STRING;
841      break;
842    case 4: case 5: case 6:
843      endTokenType = PR_COMMENT;
844      break;
845    default:
846      endTokenType = PR_PLAIN;
847      break;
848  }
849  // handle unclosed token which can legally happen for line comments (state 4)
850  tokenEnds.push(new PR_TokenEnd(k, endTokenType));  // a token ends at the end
851
852  return PR_splitChunks(chunks, tokenEnds);
853}
854
855/** used by lexSource to split a non string, non comment token.
856  * @private
857  */
858function PR_splitNonStringNonCommentToken(s, outlist) {
859  var pos = 0;
860  var state = 0;
861
862  var decodeHelper = new PR_DecodeHelper();
863  var next;
864  for (var i = 0; i <= s.length; i = next) {
865    if (i == s.length) {
866      // nstate will not be equal to state, so it will append the token
867      nstate = -2;
868      next = i + 1;
869    } else {
870      decodeHelper.decode(s, i);
871      next = decodeHelper.next;
872      var ch = decodeHelper.ch;
873
874      // the next state.
875      // if set to -1 then it will cause a reentry to state 0 without consuming
876      // another character.
877      var nstate = state;
878
879      switch (state) {
880      case 0:  // whitespace state
881        if (PR_isIdentifierStart(ch)) {
882          nstate = 1;
883        } else if (PR_isDigitChar(ch)) {
884          nstate = 2;
885        } else if (ch.match(/[\r\n]/)) {
886          nstate = 3;
887        } else if (!PR_isSpaceChar(ch)) {
888          nstate = 3;
889        }
890        if (nstate && pos < i) {
891          var t = s.substring(pos, i);
892          outlist.push(new PR_Token(t, PR_PLAIN));
893          pos = i;
894        }
895        break;
896      case 1:  // identifier state
897        if (!PR_isIdentifierPart(ch)) {
898          nstate = -1;
899        }
900        break;
901      case 2:  // number literal state
902        // handle numeric literals like
903        // 0x7f 300UL 100_000
904
905        // this does not treat floating point values as a single literal
906        //   0.1 and 3e-6
907        // are each split into multiple tokens
908        if (!(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_')) {
909          nstate = -1;
910        }
911        break;
912      case 3:  // punctuation state
913        if (PR_isIdentifierStart(ch) || PR_isDigitChar(ch) ||
914            PR_isSpaceChar(ch)) {
915          nstate = -1;
916        }
917        break;
918      }
919    }
920
921    if (nstate != state) {
922      if (nstate < 0) {
923        if (i > pos) {
924          var t = s.substring(pos, i);
925          var wordDecodeHelper = new PR_DecodeHelper();
926          wordDecodeHelper.decode(t, 0);
927          var ch0 = wordDecodeHelper.ch;
928          var isSingleCharacter = wordDecodeHelper.next == t.length;
929          var style;
930          if (PR_isIdentifierStart(ch0)) {
931            if (PR_keywords[t]) {
932              style = PR_KEYWORD;
933            } else if (ch0 === '@') {
934              style = PR_LITERAL;
935            } else {
936              // Treat any word that starts with an uppercase character and
937              // contains at least one lowercase character as a type, or
938              // ends with _t.
939              // This works perfectly for Java, pretty well for C++, and
940              // passably for Python.  The _t catches C structs.
941              var isType = false;
942              if (ch0 >= 'A' && ch0 <= 'Z') {
943                for (var j = wordDecodeHelper.next;
944                     j < t.length; j = wordDecodeHelper.next) {
945                  wordDecodeHelper.decode(t, j);
946                  var ch1 = wordDecodeHelper.ch;
947                  if (ch1 >= 'a' && ch1 <= 'z') {
948                    isType = true;
949                    break;
950                  }
951                }
952                if (!isType && !isSingleCharacter &&
953                    t.substring(t.length - 2) == '_t') {
954                  isType = true;
955                }
956              }
957              style = isType ? PR_TYPE
958                             : t==t.toUpperCase()?PR_CONSTANT
959                                                 :PR_PLAIN;
960            }
961          } else if (PR_isDigitChar(ch0)) {
962            style = PR_LITERAL;
963          } else if (!PR_isSpaceChar(ch0)) {
964            style = PR_PUNCTUATION;
965          } else if (ch0.match(/[\r\n]/)) {
966            style = PR_NL;
967          } else {
968            style = PR_PLAIN;
969          }
970          pos = i;
971          outlist.push(new PR_Token(t, style));
972        }
973
974        state = 0;
975        if (nstate == -1) {
976          // don't increment.  This allows us to use state 0 to redispatch based
977          // on the current character.
978          next = i;
979          continue;
980        }
981      }
982      state = nstate;
983    }
984  }
985
986}
987
988/** split a group of chunks of markup.
989  * @private
990  */
991function PR_tokenizeMarkup(chunks) {
992  if (!(chunks && chunks.length)) { return chunks; }
993
994  var tokenEnds = PR_splitMarkup(chunks);
995  return PR_splitChunks(chunks, tokenEnds);
996}
997
998/** split tags attributes and their values out from the tag name, and
999  * recursively lex source chunks.
1000  * @private
1001  */
1002function PR_splitTagAttributes(tokens) {
1003  var tokensOut = [];
1004  var state = 0;
1005  var stateStyle = PR_TAG;
1006  var delim = null;  // attribute delimiter for quoted value state.
1007  var decodeHelper = new PR_DecodeHelper();
1008  for (var ci = 0; ci < tokens.length; ++ci) {
1009    var tok = tokens[ci];
1010    if (PR_TAG == tok.style) {
1011      var s = tok.token;
1012      var start = 0;
1013      for (var i = 0; i < s.length; /* i = next at bottom */) {
1014        decodeHelper.decode(s, i);
1015        var ch = decodeHelper.ch;
1016        var next = decodeHelper.next;
1017
1018        var emitEnd = null;  // null or position of end of chunk to emit.
1019        var nextStyle = null;  // null or next value of stateStyle
1020        if (ch == '>') {
1021          if (PR_TAG != stateStyle) {
1022            emitEnd = i;
1023            nextStyle = PR_TAG;
1024          }
1025        } else {
1026          switch (state) {
1027            case 0:
1028              if ('<' == ch) { state = 1; }
1029              break;
1030            case 1:
1031              if (PR_isSpaceChar(ch)) { state = 2; }
1032              break;
1033            case 2:
1034              if (!PR_isSpaceChar(ch)) {
1035                nextStyle = PR_ATTRIB_NAME;
1036                emitEnd = i;
1037                state = 3;
1038              }
1039              break;
1040            case 3:
1041              if ('=' == ch) {
1042                emitEnd = i;
1043                nextStyle = PR_TAG;
1044                state = 5;
1045              } else if (PR_isSpaceChar(ch)) {
1046                emitEnd = i;
1047                nextStyle = PR_TAG;
1048                state = 4;
1049              }
1050              break;
1051            case 4:
1052              if ('=' == ch) {
1053                state = 5;
1054              } else if (!PR_isSpaceChar(ch)) {
1055                emitEnd = i;
1056                nextStyle = PR_ATTRIB_NAME;
1057                state = 3;
1058              }
1059              break;
1060            case 5:
1061              if ('"' == ch || '\'' == ch) {
1062                emitEnd = i;
1063                nextStyle = PR_ATTRIB_VALUE;
1064                state = 6;
1065                delim = ch;
1066              } else if (!PR_isSpaceChar(ch)) {
1067                emitEnd = i;
1068                nextStyle = PR_ATTRIB_VALUE;
1069                state = 7;
1070              }
1071              break;
1072            case 6:
1073              if (ch == delim) {
1074                emitEnd = next;
1075                nextStyle = PR_TAG;
1076                state = 2;
1077              }
1078              break;
1079            case 7:
1080              if (PR_isSpaceChar(ch)) {
1081                emitEnd = i;
1082                nextStyle = PR_TAG;
1083                state = 2;
1084              }
1085              break;
1086          }
1087        }
1088        if (emitEnd) {
1089          if (emitEnd > start) {
1090            tokensOut.push(
1091                new PR_Token(s.substring(start, emitEnd), stateStyle));
1092            start = emitEnd;
1093          }
1094          stateStyle = nextStyle;
1095        }
1096        i = next;
1097      }
1098      if (s.length > start) {
1099        tokensOut.push(new PR_Token(s.substring(start, s.length), stateStyle));
1100      }
1101    } else {
1102      if (tok.style) {
1103        state = 0;
1104        stateStyle = PR_TAG;
1105      }
1106      tokensOut.push(tok);
1107    }
1108  }
1109  return tokensOut;
1110}
1111
1112/** identify regions of markup that are really source code, and recursivley
1113  * lex them.
1114  * @private
1115  */
1116function PR_splitSourceNodes(tokens) {
1117  var tokensOut = [];
1118  // when we see a <script> tag, store '/' here so that we know to end the
1119  // source processing
1120  var endScriptTag = null;
1121  var decodeHelper = new PR_DecodeHelper();
1122
1123  var sourceChunks = null;
1124
1125  for (var ci = 0, nc = tokens.length; /* break below */; ++ci) {
1126    var tok;
1127
1128    if (ci < nc) {
1129      tok = tokens[ci];
1130      if (null == tok.style) {
1131        tokens.push(tok);
1132        continue;
1133      }
1134    } else if (!endScriptTag) {
1135      break;
1136    } else {
1137      // else pretend there's an end tag so we can gracefully handle
1138      // unclosed source blocks
1139      tok = new PR_Token('', null);
1140    }
1141
1142    var s = tok.token;
1143
1144    if (null == endScriptTag) {
1145      if (PR_SOURCE == tok.style) {
1146        // split off any starting and trailing <?, <%
1147        if ('<' == decodeHelper.decode(s, 0)) {
1148          decodeHelper.decode(s, decodeHelper.next);
1149          if ('%' == decodeHelper.ch || '?' == decodeHelper.ch) {
1150            endScriptTag = decodeHelper.ch;
1151            tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
1152                                        PR_TAG));
1153            s = s.substring(decodeHelper.next, s.length);
1154          }
1155        }
1156      } else if (PR_TAG == tok.style) {
1157        if ('<' == decodeHelper.decode(s, 0) &&
1158            '/' != s.charAt(decodeHelper.next)) {
1159          var tagContent = s.substring(decodeHelper.next).toLowerCase();
1160          // FIXME(msamuel): this does not mirror exactly the code in
1161          // in PR_splitMarkup that defers splitting tags inside script and
1162          // style blocks.
1163          if (PR_startsWith(tagContent, 'script') ||
1164              PR_startsWith(tagContent, 'style') ||
1165              PR_startsWith(tagContent, 'xmp')) {
1166            endScriptTag = '/';
1167          }
1168        }
1169      }
1170    }
1171
1172    if (null != endScriptTag) {
1173      var endTok = null;
1174      if (PR_SOURCE == tok.style) {
1175        if (endScriptTag == '%' || endScriptTag == '?') {
1176          var pos = s.lastIndexOf(endScriptTag);
1177          if (pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
1178              s.length == decodeHelper.next) {
1179            endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
1180            s = s.substring(0, pos);
1181          }
1182        }
1183        if (null == sourceChunks) { sourceChunks = []; }
1184        sourceChunks.push(new PR_Token(s, PR_PLAIN));
1185      } else if (PR_PLAIN == tok.style) {
1186        if (null == sourceChunks) { sourceChunks = []; }
1187        sourceChunks.push(tok);
1188      } else if (PR_TAG == tok.style) {
1189        // if it starts with </ then it must be the end tag.
1190        if ('<' == decodeHelper.decode(tok.token, 0) &&
1191            tok.token.length > decodeHelper.next &&
1192            '/' == decodeHelper.decode(tok.token, decodeHelper.next)) {
1193          endTok = tok;
1194        } else {
1195          tokensOut.push(tok);
1196        }
1197      } else if (ci >= nc) {
1198        // force the token to close
1199        endTok = tok;
1200      } else {
1201        if (sourceChunks) {
1202          sourceChunks.push(tok);
1203        } else {
1204          // push remaining tag and attribute tokens from the opening tag
1205          tokensOut.push(tok);
1206        }
1207      }
1208      if (endTok) {
1209        if (sourceChunks) {
1210          var sourceTokens = PR_lexSource(sourceChunks);
1211          tokensOut.push(new PR_Token('<span class=embsrc>', null));
1212          for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
1213            tokensOut.push(sourceTokens[si]);
1214          }
1215          tokensOut.push(new PR_Token('</span>', null));
1216          sourceChunks = null;
1217        }
1218        if (endTok.token) { tokensOut.push(endTok); }
1219        endScriptTag = null;
1220      }
1221    } else {
1222      tokensOut.push(tok);
1223    }
1224  }
1225  return tokensOut;
1226}
1227
1228/** splits the quotes from an attribute value.
1229  * ['"foo"'] -> ['"', 'foo', '"']
1230  * @private
1231  */
1232function PR_splitAttributeQuotes(tokens) {
1233  var firstPlain = null, lastPlain = null;
1234  for (var i = 0; i < tokens.length; ++i) {
1235    if (PR_PLAIN == tokens[i].style) {
1236      firstPlain = i;
1237      break;
1238    }
1239  }
1240  for (var i = tokens.length; --i >= 0;) {
1241    if (PR_PLAIN == tokens[i].style) {
1242      lastPlain = i;
1243      break;
1244    }
1245  }
1246  if (null == firstPlain) { return tokens; }
1247
1248  var decodeHelper = new PR_DecodeHelper();
1249  var fs = tokens[firstPlain].token;
1250  var fc = decodeHelper.decode(fs, 0);
1251  if ('"' != fc && '\'' != fc) {
1252    return tokens;
1253  }
1254  var fpos = decodeHelper.next;
1255
1256  var ls = tokens[lastPlain].token;
1257  var lpos = ls.lastIndexOf('&');
1258  if (lpos < 0) { lpos = ls.length - 1; }
1259  var lc = decodeHelper.decode(ls, lpos);
1260  if (lc != fc || decodeHelper.next != ls.length) {
1261    lc = null;
1262    lpos = ls.length;
1263  }
1264
1265  var tokensOut = [];
1266  for (var i = 0; i < firstPlain; ++i) {
1267    tokensOut.push(tokens[i]);
1268  }
1269  tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
1270  if (lastPlain == firstPlain) {
1271    tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
1272  } else {
1273    tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
1274    for (var i = firstPlain + 1; i < lastPlain; ++i) {
1275      tokensOut.push(tokens[i]);
1276    }
1277    if (lc) {
1278      tokens.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
1279    } else {
1280      tokens.push(tokens[lastPlain]);
1281    }
1282  }
1283  if (lc) {
1284    tokensOut.push(new PR_Token(ls.substring(lpos, ls.length), PR_PLAIN));
1285  }
1286  for (var i = lastPlain + 1; i < tokens.length; ++i) {
1287    tokensOut.push(tokens[i]);
1288  }
1289  return tokensOut;
1290}
1291
1292/** identify attribute values that really contain source code and recursively
1293  * lex them.
1294  * @private
1295  */
1296function PR_splitSourceAttributes(tokens) {
1297  var tokensOut = [];
1298
1299  var sourceChunks = null;
1300  var inSource = false;
1301  var name = '';
1302
1303  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
1304    var tok = tokens[ci];
1305    var outList = tokensOut;
1306    if (PR_TAG == tok.style) {
1307      if (inSource) {
1308        inSource = false;
1309        name = '';
1310        if (sourceChunks) {
1311          tokensOut.push(new PR_Token('<span class=embsrc>', null));
1312          var sourceTokens =
1313            PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
1314          for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
1315            tokensOut.push(sourceTokens[si]);
1316          }
1317          tokensOut.push(new PR_Token('</span>', null));
1318          sourceChunks = null;
1319        }
1320      } else if (name && tok.token.indexOf('=') >= 0) {
1321        var nameLower = name.toLowerCase();
1322        if (PR_startsWith(nameLower, 'on') || 'style' == nameLower) {
1323          inSource = true;
1324        }
1325      } else {
1326        name = '';
1327      }
1328    } else if (PR_ATTRIB_NAME == tok.style) {
1329      name += tok.token;
1330    } else if (PR_ATTRIB_VALUE == tok.style) {
1331      if (inSource) {
1332        if (null == sourceChunks) { sourceChunks = []; }
1333        outList = sourceChunks;
1334        tok = new PR_Token(tok.token, PR_PLAIN);
1335      }
1336    } else {
1337      if (sourceChunks) {
1338        outList = sourceChunks;
1339      }
1340    }
1341    outList.push(tok);
1342  }
1343  return tokensOut;
1344}
1345
1346/** returns a list of PR_Token objects given chunks of source code.
1347  *
1348  * This code treats ", ', and ` as string delimiters, and \ as a string escape.
1349  * It does not recognize perl's qq() style strings.  It has no special handling
1350  * for double delimiter escapes as in basic, or tje tripled delimiters used in
1351  * python, but should work on those regardless although in those cases a single
1352  * string literal may be broken up into multiple adjacent string literals.
1353  *
1354  * It recognizes C, C++, and shell style comments.
1355  *
1356  * @param chunks PR_Tokens with style in (null, PR_PLAIN)
1357  */
1358function PR_lexSource(chunks) {
1359  // split into strings, comments, and other.
1360  // We do this because strings and comments are easily recognizable and can
1361  // contain stuff that looks like other tokens, so we want to mark those early
1362  // so we don't recurse into them.
1363  var tokens = PR_splitStringAndCommentTokens(chunks);
1364
1365  // split non comment|string tokens on whitespace and word boundaries
1366  var tokensOut = [];
1367  for (var i = 0; i < tokens.length; ++i) {
1368    var tok = tokens[i];
1369    if (PR_PLAIN === tok.style) {
1370      PR_splitNonStringNonCommentToken(tok.token, tokensOut);
1371      continue;
1372    }
1373    tokensOut.push(tok);
1374  }
1375
1376  return tokensOut;
1377}
1378
1379/** returns a list of PR_Token objects given a string of markup.
1380  *
1381  * This code assumes that < tokens are html escaped, but " are not.
1382  * It will do a resonable job with <, but will not recognize an &quot;
1383  * as starting a string.
1384  *
1385  * This code recognizes a number of constructs.
1386  * <!-- ... --> comment
1387  * <!\w ... >   declaration
1388  * <\w ... >    tag
1389  * </\w ... >   tag
1390  * <?...?>      embedded source
1391  * &[#\w]...;   entity
1392  *
1393  * It does not recognizes %foo; entities.
1394  *
1395  * It will recurse into any <style>, <script>, and on* attributes using
1396  * PR_lexSource.
1397  */
1398function PR_lexMarkup(chunks) {
1399  // This function works as follows:
1400  // 1) Start by splitting the markup into text and tag chunks
1401  //    Input:  String s
1402  //    Output: List<PR_Token> where style in (PR_PLAIN, null)
1403  // 2) Then split the text chunks further into comments, declarations,
1404  //    tags, etc.
1405  //    After each split, consider whether the token is the start of an
1406  //    embedded source section, i.e. is an open <script> tag.  If it is,
1407  //    find the corresponding close token, and don't bother to lex in between.
1408  //    Input:  List<String>
1409  //    Output: List<PR_Token> with style in (PR_TAG, PR_PLAIN, PR_SOURCE, null)
1410  // 3) Finally go over each tag token and split out attribute names and values.
1411  //    Input:  List<PR_Token>
1412  //    Output: List<PR_Token> where style in
1413  //            (PR_TAG, PR_PLAIN, PR_SOURCE, NAME, VALUE, null)
1414  var tokensOut = PR_tokenizeMarkup(chunks);
1415  tokensOut = PR_splitTagAttributes(tokensOut);
1416  tokensOut = PR_splitSourceNodes(tokensOut);
1417  tokensOut = PR_splitSourceAttributes(tokensOut);
1418  return tokensOut;
1419}
1420
1421/**
1422 * classify the string as either source or markup and lex appropriately.
1423 * @param {String} html
1424 */
1425function PR_lexOne(html) {
1426//  var chunks = PR_expandTabs(PR_chunkify(html), PR_TAB_WIDTH);
1427  var chunks = PR_chunkify(html);
1428
1429  // treat it as markup if the first non whitespace character is a < and the
1430  // last non-whitespace character is a >
1431  var isMarkup = false;
1432  for (var i = 0; i < chunks.length; ++i) {
1433    if (PR_PLAIN == chunks[i].style) {
1434      if (PR_startsWith(PR_trim(chunks[i].token), '&lt;')) {
1435        for (var j = chunks.length; --j >= 0;) {
1436          if (PR_PLAIN == chunks[j].style) {
1437            isMarkup = PR_endsWith(PR_trim(chunks[j].token), '&gt;');
1438            break;
1439          }
1440        }
1441      }
1442      break;
1443    }
1444  }
1445
1446  return isMarkup ? PR_lexMarkup(chunks) : PR_lexSource(chunks);
1447}