/**
 * @license
 * Copyright (C) 2006 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
/**
 * @fileoverview
 * some functions for browser-side pretty printing of code contained in html.
 *
 * <p>
 * For a fairly comprehensive set of languages see the
 * <a href="https://github.com/google/code-prettify#for-which-languages-does-it-work">README</a>
 * file that came with this source.  At a minimum, the lexer should work on a
 * number of languages including C and friends, Java, Python, Bash, SQL, HTML,
 * XML, CSS, Javascript, and Makefiles.  It works passably on Ruby, PHP and Awk
 * and a subset of Perl, but, because of commenting conventions, doesn't work on
 * Smalltalk, Lisp-like, or CAML-like languages without an explicit lang class.
 * <p>
 * Usage: <ol>
 * <li> include this source file in an html page via
 *   {@code <script type="text/javascript" src="/path/to/prettify.js"></script>}
 * <li> define style rules.  See the example page for examples.
 * <li> mark the {@code <pre>} and {@code <code>} tags in your source with
 *    {@code class=prettyprint.}
 *    You can also use the (html deprecated) {@code <xmp>} tag, but the pretty
 *    printer needs to do more substantial DOM manipulations to support that, so
 *    some css styles may not be preserved.
 * </ol>
 * That's it.  I wanted to keep the API as simple as possible, so there's no
 * need to specify which language the code is in, but if you wish, you can add
 * another class to the {@code <pre>} or {@code <code>} element to specify the
 * language, as in {@code <pre class="prettyprint lang-java">}.  Any class that
 * starts with "lang-" followed by a file extension, specifies the file type.
 * See the "lang-*.js" files in this directory for code that implements
 * per-language file handlers.
 * <p>
 * Change log:<br>
 * cbeust, 2006/08/22
 * <blockquote>
 *   Java annotations (start with "@") are now captured as literals ("lit")
 * </blockquote>
 * @requires console
 */
56
// JSLint declarations
/*global console, document, navigator, setTimeout, window, define */
59
/**
 * @typedef {!Array.<number|string>}
 * Alternating indices and the decorations that should be inserted there.
 * The indices are monotonically increasing.
 */
var DecorationsT;

/**
 * @typedef {!{
 *   sourceNode: !Element,
 *   pre: !(number|boolean),
 *   langExtension: ?string,
 *   numberLines: ?(number|boolean),
 *   sourceCode: ?string,
 *   spans: ?(Array.<number|Node>),
 *   basePos: ?number,
 *   decorations: ?DecorationsT
 * }}
 * <dl>
 *  <dt>sourceNode<dd>the element containing the source
 *  <dt>sourceCode<dd>source as plain text
 *  <dt>pre<dd>truthy if white-space in text nodes
 *     should be considered significant.
 *  <dt>spans<dd> alternating span start indices into source
 *     and the text node or element (e.g. {@code <BR>}) corresponding to that
 *     span.
 *  <dt>decorations<dd>an array of style classes preceded
 *     by the position at which they start in job.sourceCode in order
 *  <dt>basePos<dd>integer position of this.sourceCode in the larger chunk of
 *     source.
 * </dl>
 */
var JobT;

/**
 * @typedef {!{
 *   sourceCode: string,
 *   spans: !(Array.<number|Node>)
 * }}
 * <dl>
 *  <dt>sourceCode<dd>source as plain text
 *  <dt>spans<dd> alternating span start indices into source
 *     and the text node or element (e.g. {@code <BR>}) corresponding to that
 *     span.
 * </dl>
 */
var SourceSpansT;

/** @define {boolean} */
var IN_GLOBAL_SCOPE = true;


/**
 * {@type !{
 *   'createSimpleLexer': function (Array, Array): (function (JobT)),
 *   'registerLangHandler': function (function (JobT), Array.<string>),
 *   'PR_ATTRIB_NAME': string,
 *   'PR_ATTRIB_VALUE': string,
 *   'PR_COMMENT': string,
 *   'PR_DECLARATION': string,
 *   'PR_KEYWORD': string,
 *   'PR_LITERAL': string,
 *   'PR_NOCODE': string,
 *   'PR_PLAIN': string,
 *   'PR_PUNCTUATION': string,
 *   'PR_SOURCE': string,
 *   'PR_STRING': string,
 *   'PR_TAG': string,
 *   'PR_TYPE': string,
 *   'prettyPrintOne': function (string, string, number|boolean),
 *   'prettyPrint': function (?function, ?(HTMLElement|HTMLDocument))
 * }}
 * @const
 */
var PR;
136
/**
 * Split {@code prettyPrint} into multiple timeouts so as not to interfere with
 * UI events.
 * If set to {@code false}, {@code prettyPrint()} is synchronous.
 */
var PR_SHOULD_USE_CONTINUATION = true;
// Mirror the flag onto the window object when one exists so that it is
// reachable by its quoted name as well.
if (typeof window !== 'undefined') {
  window['PR_SHOULD_USE_CONTINUATION'] = PR_SHOULD_USE_CONTINUATION;
}

/**
 * Pretty print a chunk of code.
 * @param {string} sourceCodeHtml The HTML to pretty print.
 * @param {string} opt_langExtension The language name to use.
 *     Typically, a filename extension like 'cpp' or 'java'.
 * @param {number|boolean} opt_numberLines True to number lines,
 *     or the 1-indexed number of the first line in sourceCodeHtml.
 * @return {string} code as html, but prettier
 */
var prettyPrintOne;
/**
 * Find all the {@code <pre>} and {@code <code>} tags in the DOM with
 * {@code class=prettyprint} and prettify them.
 *
 * @param {Function} opt_whenDone called when prettifying is done.
 * @param {HTMLElement|HTMLDocument} opt_root an element or document
 *   containing all the elements to pretty print.
 *   Defaults to {@code document.body}.
 */
var prettyPrint;
167
168
169(function () {
170  var win = (typeof window !== 'undefined') ? window : {};
171  // Keyword lists for various languages.
172  // We use things that coerce to strings to make them compact when minified
173  // and to defeat aggressive optimizers that fold large string constants.
174  var FLOW_CONTROL_KEYWORDS = ["break,continue,do,else,for,if,return,while"];
175  var C_KEYWORDS = [FLOW_CONTROL_KEYWORDS,"auto,case,char,const,default," +
176      "double,enum,extern,float,goto,inline,int,long,register,restrict,short,signed," +
177      "sizeof,static,struct,switch,typedef,union,unsigned,void,volatile"];
178  var COMMON_KEYWORDS = [C_KEYWORDS,"catch,class,delete,false,import," +
179      "new,operator,private,protected,public,this,throw,true,try,typeof"];
180  var CPP_KEYWORDS = [COMMON_KEYWORDS,"alignas,alignof,align_union,asm,axiom,bool," +
181      "concept,concept_map,const_cast,constexpr,decltype,delegate," +
182      "dynamic_cast,explicit,export,friend,generic,late_check," +
183      "mutable,namespace,noexcept,noreturn,nullptr,property,reinterpret_cast,static_assert," +
184      "static_cast,template,typeid,typename,using,virtual,where"];
185  var JAVA_KEYWORDS = [COMMON_KEYWORDS,
186      "abstract,assert,boolean,byte,extends,finally,final,implements,import," +
187      "instanceof,interface,null,native,package,strictfp,super,synchronized," +
188      "throws,transient"];
189  var CSHARP_KEYWORDS = [COMMON_KEYWORDS,
190      "abstract,add,alias,as,ascending,async,await,base,bool,by,byte,checked,decimal,delegate,descending," +
191      "dynamic,event,finally,fixed,foreach,from,get,global,group,implicit,in,interface," +
192      "internal,into,is,join,let,lock,null,object,out,override,orderby,params," +
193      "partial,readonly,ref,remove,sbyte,sealed,select,set,stackalloc,string,select,uint,ulong," +
194      "unchecked,unsafe,ushort,value,var,virtual,where,yield"];
195  var COFFEE_KEYWORDS = "all,and,by,catch,class,else,extends,false,finally," +
196      "for,if,in,is,isnt,loop,new,no,not,null,of,off,on,or,return,super,then," +
197      "throw,true,try,unless,until,when,while,yes";
198  var JSCRIPT_KEYWORDS = [COMMON_KEYWORDS,
199      "abstract,async,await,constructor,debugger,enum,eval,export,from,function," +
200      "get,import,implements,instanceof,interface,let,null,of,set,undefined," +
201      "var,with,yield,Infinity,NaN"];
202  var PERL_KEYWORDS = "caller,delete,die,do,dump,elsif,eval,exit,foreach,for," +
203      "goto,if,import,last,local,my,next,no,our,print,package,redo,require," +
204      "sub,undef,unless,until,use,wantarray,while,BEGIN,END";
205  var PYTHON_KEYWORDS = [FLOW_CONTROL_KEYWORDS, "and,as,assert,class,def,del," +
206      "elif,except,exec,finally,from,global,import,in,is,lambda," +
207      "nonlocal,not,or,pass,print,raise,try,with,yield," +
208      "False,True,None"];
209  var RUBY_KEYWORDS = [FLOW_CONTROL_KEYWORDS, "alias,and,begin,case,class," +
210      "def,defined,elsif,end,ensure,false,in,module,next,nil,not,or,redo," +
211      "rescue,retry,self,super,then,true,undef,unless,until,when,yield," +
212      "BEGIN,END"];
213  var SH_KEYWORDS = [FLOW_CONTROL_KEYWORDS, "case,done,elif,esac,eval,fi," +
214      "function,in,local,set,then,until"];
215  var ALL_KEYWORDS = [
216      CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS, JSCRIPT_KEYWORDS,
217      PERL_KEYWORDS, PYTHON_KEYWORDS, RUBY_KEYWORDS, SH_KEYWORDS];
218  var C_TYPES = /^(DIR|FILE|array|vector|(de|priority_)?queue|(forward_)?list|stack|(const_)?(reverse_)?iterator|(unordered_)?(multi)?(set|map)|bitset|u?(int|float)\d*)\b/;
219
220  // token style names.  correspond to css classes
221  /**
222   * token style for a string literal
223   * @const
224   */
225  var PR_STRING = 'str';
226  /**
227   * token style for a keyword
228   * @const
229   */
230  var PR_KEYWORD = 'kwd';
231  /**
232   * token style for a comment
233   * @const
234   */
235  var PR_COMMENT = 'com';
236  /**
237   * token style for a type
238   * @const
239   */
240  var PR_TYPE = 'typ';
241  /**
242   * token style for a literal value.  e.g. 1, null, true.
243   * @const
244   */
245  var PR_LITERAL = 'lit';
246  /**
247   * token style for a punctuation string.
248   * @const
249   */
250  var PR_PUNCTUATION = 'pun';
251  /**
252   * token style for plain text.
253   * @const
254   */
255  var PR_PLAIN = 'pln';
256
257  /**
258   * token style for an sgml tag.
259   * @const
260   */
261  var PR_TAG = 'tag';
262  /**
263   * token style for a markup declaration such as a DOCTYPE.
264   * @const
265   */
266  var PR_DECLARATION = 'dec';
267  /**
268   * token style for embedded source.
269   * @const
270   */
271  var PR_SOURCE = 'src';
272  /**
273   * token style for an sgml attribute name.
274   * @const
275   */
276  var PR_ATTRIB_NAME = 'atn';
277  /**
278   * token style for an sgml attribute value.
279   * @const
280   */
281  var PR_ATTRIB_VALUE = 'atv';
282
283  /**
284   * A class that indicates a section of markup that is not code, e.g. to allow
285   * embedding of line numbers within code listings.
286   * @const
287   */
288  var PR_NOCODE = 'nocode';
289
290
291  // Regex pattern below is automatically generated by regexpPrecederPatterns.pl
292  // Do not modify, your changes will be erased.
293
294  // CAVEAT: this does not properly handle the case where a regular
295  // expression immediately follows another since a regular expression may
296  // have flags for case-sensitivity and the like.  Having regexp tokens
297  // adjacent is not valid in any language I'm aware of, so I'm punting.
298  // TODO: maybe style special characters inside a regexp as punctuation.
299
300  /**
301   * A set of tokens that can precede a regular expression literal in
302   * javascript
303   * http://web.archive.org/web/20070717142515/http://www.mozilla.org/js/language/js20/rationale/syntax.html
304   * has the full list, but I've removed ones that might be problematic when
305   * seen in languages that don't support regular expression literals.
306   *
307   * Specifically, I've removed any keywords that can't precede a regexp
308   * literal in a syntactically legal javascript program, and I've removed the
309   * "in" keyword since it's not a keyword in many languages, and might be used
310   * as a count of inches.
311   *
312   * The link above does not accurately describe EcmaScript rules since
313   * it fails to distinguish between (a=++/b/i) and (a++/b/i) but it works
314   * very well in practice.
315   *
316   * @private
317   * @const
318   */
319  var REGEXP_PRECEDER_PATTERN = '(?:^^\\.?|[+-]|[!=]=?=?|\\#|%=?|&&?=?|\\(|\\*=?|[+\\-]=|->|\\/=?|::?|<<?=?|>>?>?=?|,|;|\\?|@|\\[|~|{|\\^\\^?=?|\\|\\|?=?|break|case|continue|delete|do|else|finally|instanceof|return|throw|try|typeof)\\s*';
320
321
322  /**
323   * Given a group of {@link RegExp}s, returns a {@code RegExp} that globally
324   * matches the union of the sets of strings matched by the input RegExp.
325   * Since it matches globally, if the input strings have a start-of-input
326   * anchor (/^.../), it is ignored for the purposes of unioning.
327   * @param {Array.<RegExp>} regexs non multiline, non-global regexs.
328   * @return {RegExp} a global regex.
329   */
330  function combinePrefixPatterns(regexs) {
331    var capturedGroupIndex = 0;
332
333    var needToFoldCase = false;
334    var ignoreCase = false;
335    for (var i = 0, n = regexs.length; i < n; ++i) {
336      var regex = regexs[i];
337      if (regex.ignoreCase) {
338        ignoreCase = true;
339      } else if (/[a-z]/i.test(regex.source.replace(
340                     /\\u[0-9a-f]{4}|\\x[0-9a-f]{2}|\\[^ux]/gi, ''))) {
341        needToFoldCase = true;
342        ignoreCase = false;
343        break;
344      }
345    }
346
347    var escapeCharToCodeUnit = {
348      'b': 8,
349      't': 9,
350      'n': 0xa,
351      'v': 0xb,
352      'f': 0xc,
353      'r': 0xd
354    };
355
356    function decodeEscape(charsetPart) {
357      var cc0 = charsetPart.charCodeAt(0);
358      if (cc0 !== 92 /* \\ */) {
359        return cc0;
360      }
361      var c1 = charsetPart.charAt(1);
362      cc0 = escapeCharToCodeUnit[c1];
363      if (cc0) {
364        return cc0;
365      } else if ('0' <= c1 && c1 <= '7') {
366        return parseInt(charsetPart.substring(1), 8);
367      } else if (c1 === 'u' || c1 === 'x') {
368        return parseInt(charsetPart.substring(2), 16);
369      } else {
370        return charsetPart.charCodeAt(1);
371      }
372    }
373
374    function encodeEscape(charCode) {
375      if (charCode < 0x20) {
376        return (charCode < 0x10 ? '\\x0' : '\\x') + charCode.toString(16);
377      }
378      var ch = String.fromCharCode(charCode);
379      return (ch === '\\' || ch === '-' || ch === ']' || ch === '^')
380          ? "\\" + ch : ch;
381    }
382
383    function caseFoldCharset(charSet) {
384      var charsetParts = charSet.substring(1, charSet.length - 1).match(
385          new RegExp(
386              '\\\\u[0-9A-Fa-f]{4}'
387              + '|\\\\x[0-9A-Fa-f]{2}'
388              + '|\\\\[0-3][0-7]{0,2}'
389              + '|\\\\[0-7]{1,2}'
390              + '|\\\\[\\s\\S]'
391              + '|-'
392              + '|[^-\\\\]',
393              'g'));
394      var ranges = [];
395      var inverse = charsetParts[0] === '^';
396
397      var out = ['['];
398      if (inverse) { out.push('^'); }
399
400      for (var i = inverse ? 1 : 0, n = charsetParts.length; i < n; ++i) {
401        var p = charsetParts[i];
402        if (/\\[bdsw]/i.test(p)) {  // Don't muck with named groups.
403          out.push(p);
404        } else {
405          var start = decodeEscape(p);
406          var end;
407          if (i + 2 < n && '-' === charsetParts[i + 1]) {
408            end = decodeEscape(charsetParts[i + 2]);
409            i += 2;
410          } else {
411            end = start;
412          }
413          ranges.push([start, end]);
414          // If the range might intersect letters, then expand it.
415          // This case handling is too simplistic.
416          // It does not deal with non-latin case folding.
417          // It works for latin source code identifiers though.
418          if (!(end < 65 || start > 122)) {
419            if (!(end < 65 || start > 90)) {
420              ranges.push([Math.max(65, start) | 32, Math.min(end, 90) | 32]);
421            }
422            if (!(end < 97 || start > 122)) {
423              ranges.push([Math.max(97, start) & ~32, Math.min(end, 122) & ~32]);
424            }
425          }
426        }
427      }
428
429      // [[1, 10], [3, 4], [8, 12], [14, 14], [16, 16], [17, 17]]
430      // -> [[1, 12], [14, 14], [16, 17]]
431      ranges.sort(function (a, b) { return (a[0] - b[0]) || (b[1]  - a[1]); });
432      var consolidatedRanges = [];
433      var lastRange = [];
434      for (var i = 0; i < ranges.length; ++i) {
435        var range = ranges[i];
436        if (range[0] <= lastRange[1] + 1) {
437          lastRange[1] = Math.max(lastRange[1], range[1]);
438        } else {
439          consolidatedRanges.push(lastRange = range);
440        }
441      }
442
443      for (var i = 0; i < consolidatedRanges.length; ++i) {
444        var range = consolidatedRanges[i];
445        out.push(encodeEscape(range[0]));
446        if (range[1] > range[0]) {
447          if (range[1] + 1 > range[0]) { out.push('-'); }
448          out.push(encodeEscape(range[1]));
449        }
450      }
451      out.push(']');
452      return out.join('');
453    }
454
455    function allowAnywhereFoldCaseAndRenumberGroups(regex) {
456      // Split into character sets, escape sequences, punctuation strings
457      // like ('(', '(?:', ')', '^'), and runs of characters that do not
458      // include any of the above.
459      var parts = regex.source.match(
460          new RegExp(
461              '(?:'
462              + '\\[(?:[^\\x5C\\x5D]|\\\\[\\s\\S])*\\]'  // a character set
463              + '|\\\\u[A-Fa-f0-9]{4}'  // a unicode escape
464              + '|\\\\x[A-Fa-f0-9]{2}'  // a hex escape
465              + '|\\\\[0-9]+'  // a back-reference or octal escape
466              + '|\\\\[^ux0-9]'  // other escape sequence
467              + '|\\(\\?[:!=]'  // start of a non-capturing group
468              + '|[\\(\\)\\^]'  // start/end of a group, or line start
469              + '|[^\\x5B\\x5C\\(\\)\\^]+'  // run of other characters
470              + ')',
471              'g'));
472      var n = parts.length;
473
474      // Maps captured group numbers to the number they will occupy in
475      // the output or to -1 if that has not been determined, or to
476      // undefined if they need not be capturing in the output.
477      var capturedGroups = [];
478
479      // Walk over and identify back references to build the capturedGroups
480      // mapping.
481      for (var i = 0, groupIndex = 0; i < n; ++i) {
482        var p = parts[i];
483        if (p === '(') {
484          // groups are 1-indexed, so max group index is count of '('
485          ++groupIndex;
486        } else if ('\\' === p.charAt(0)) {
487          var decimalValue = +p.substring(1);
488          if (decimalValue) {
489            if (decimalValue <= groupIndex) {
490              capturedGroups[decimalValue] = -1;
491            } else {
492              // Replace with an unambiguous escape sequence so that
493              // an octal escape sequence does not turn into a backreference
494              // to a capturing group from an earlier regex.
495              parts[i] = encodeEscape(decimalValue);
496            }
497          }
498        }
499      }
500
501      // Renumber groups and reduce capturing groups to non-capturing groups
502      // where possible.
503      for (var i = 1; i < capturedGroups.length; ++i) {
504        if (-1 === capturedGroups[i]) {
505          capturedGroups[i] = ++capturedGroupIndex;
506        }
507      }
508      for (var i = 0, groupIndex = 0; i < n; ++i) {
509        var p = parts[i];
510        if (p === '(') {
511          ++groupIndex;
512          if (!capturedGroups[groupIndex]) {
513            parts[i] = '(?:';
514          }
515        } else if ('\\' === p.charAt(0)) {
516          var decimalValue = +p.substring(1);
517          if (decimalValue && decimalValue <= groupIndex) {
518            parts[i] = '\\' + capturedGroups[decimalValue];
519          }
520        }
521      }
522
523      // Remove any prefix anchors so that the output will match anywhere.
524      // ^^ really does mean an anchored match though.
525      for (var i = 0; i < n; ++i) {
526        if ('^' === parts[i] && '^' !== parts[i + 1]) { parts[i] = ''; }
527      }
528
529      // Expand letters to groups to handle mixing of case-sensitive and
530      // case-insensitive patterns if necessary.
531      if (regex.ignoreCase && needToFoldCase) {
532        for (var i = 0; i < n; ++i) {
533          var p = parts[i];
534          var ch0 = p.charAt(0);
535          if (p.length >= 2 && ch0 === '[') {
536            parts[i] = caseFoldCharset(p);
537          } else if (ch0 !== '\\') {
538            // TODO: handle letters in numeric escapes.
539            parts[i] = p.replace(
540                /[a-zA-Z]/g,
541                function (ch) {
542                  var cc = ch.charCodeAt(0);
543                  return '[' + String.fromCharCode(cc & ~32, cc | 32) + ']';
544                });
545          }
546        }
547      }
548
549      return parts.join('');
550    }
551
552    var rewritten = [];
553    for (var i = 0, n = regexs.length; i < n; ++i) {
554      var regex = regexs[i];
555      if (regex.global || regex.multiline) { throw new Error('' + regex); }
556      rewritten.push(
557          '(?:' + allowAnywhereFoldCaseAndRenumberGroups(regex) + ')');
558    }
559
560    return new RegExp(rewritten.join('|'), ignoreCase ? 'gi' : 'g');
561  }
562
563
564  /**
565   * Split markup into a string of source code and an array mapping ranges in
566   * that string to the text nodes in which they appear.
567   *
568   * <p>
569   * The HTML DOM structure:</p>
570   * <pre>
571   * (Element   "p"
572   *   (Element "b"
573   *     (Text  "print "))       ; #1
574   *   (Text    "'Hello '")      ; #2
575   *   (Element "br")            ; #3
576   *   (Text    "  + 'World';")) ; #4
577   * </pre>
578   * <p>
579   * corresponds to the HTML
580   * {@code <p><b>print </b>'Hello '<br>  + 'World';</p>}.</p>
581   *
582   * <p>
583   * It will produce the output:</p>
584   * <pre>
585   * {
586   *   sourceCode: "print 'Hello '\n  + 'World';",
587   *   //                     1          2
588   *   //           012345678901234 5678901234567
589   *   spans: [0, #1, 6, #2, 14, #3, 15, #4]
590   * }
591   * </pre>
592   * <p>
593   * where #1 is a reference to the {@code "print "} text node above, and so
594   * on for the other text nodes.
595   * </p>
596   *
597   * <p>
598   * The {@code} spans array is an array of pairs.  Even elements are the start
599   * indices of substrings, and odd elements are the text nodes (or BR elements)
600   * that contain the text for those substrings.
601   * Substrings continue until the next index or the end of the source.
602   * </p>
603   *
604   * @param {Node} node an HTML DOM subtree containing source-code.
605   * @param {boolean|number} isPreformatted truthy if white-space in
606   *    text nodes should be considered significant.
607   * @return {SourceSpansT} source code and the nodes in which they occur.
608   */
609  function extractSourceSpans(node, isPreformatted) {
610    var nocode = /(?:^|\s)nocode(?:\s|$)/;
611
612    var chunks = [];
613    var length = 0;
614    var spans = [];
615    var k = 0;
616
617    function walk(node) {
618      var type = node.nodeType;
619      if (type == 1) {  // Element
620        if (nocode.test(node.className)) { return; }
621        for (var child = node.firstChild; child; child = child.nextSibling) {
622          walk(child);
623        }
624        var nodeName = node.nodeName.toLowerCase();
625        if ('br' === nodeName || 'li' === nodeName) {
626          chunks[k] = '\n';
627          spans[k << 1] = length++;
628          spans[(k++ << 1) | 1] = node;
629        }
630      } else if (type == 3 || type == 4) {  // Text
631        var text = node.nodeValue;
632        if (text.length) {
633          if (!isPreformatted) {
634            text = text.replace(/[ \t\r\n]+/g, ' ');
635          } else {
636            text = text.replace(/\r\n?/g, '\n');  // Normalize newlines.
637          }
638          // TODO: handle tabs here?
639          chunks[k] = text;
640          spans[k << 1] = length;
641          length += text.length;
642          spans[(k++ << 1) | 1] = node;
643        }
644      }
645    }
646
647    walk(node);
648
649    return {
650      sourceCode: chunks.join('').replace(/\n$/, ''),
651      spans: spans
652    };
653  }
654
655
656  /**
657   * Apply the given language handler to sourceCode and add the resulting
658   * decorations to out.
659   * @param {!Element} sourceNode
660   * @param {number} basePos the index of sourceCode within the chunk of source
661   *    whose decorations are already present on out.
662   * @param {string} sourceCode
663   * @param {function(JobT)} langHandler
664   * @param {DecorationsT} out
665   */
666  function appendDecorations(
667      sourceNode, basePos, sourceCode, langHandler, out) {
668    if (!sourceCode) { return; }
669    /** @type {JobT} */
670    var job = {
671      sourceNode: sourceNode,
672      pre: 1,
673      langExtension: null,
674      numberLines: null,
675      sourceCode: sourceCode,
676      spans: null,
677      basePos: basePos,
678      decorations: null
679    };
680    langHandler(job);
681    out.push.apply(out, job.decorations);
682  }
683
684  var notWs = /\S/;
685
686  /**
687   * Given an element, if it contains only one child element and any text nodes
688   * it contains contain only space characters, return the sole child element.
689   * Otherwise returns undefined.
690   * <p>
691   * This is meant to return the CODE element in {@code <pre><code ...>} when
692   * there is a single child element that contains all the non-space textual
693   * content, but not to return anything where there are multiple child elements
694   * as in {@code <pre><code>...</code><code>...</code></pre>} or when there
695   * is textual content.
696   */
697  function childContentWrapper(element) {
698    var wrapper = undefined;
699    for (var c = element.firstChild; c; c = c.nextSibling) {
700      var type = c.nodeType;
701      wrapper = (type === 1)  // Element Node
702          ? (wrapper ? element : c)
703          : (type === 3)  // Text Node
704          ? (notWs.test(c.nodeValue) ? element : wrapper)
705          : wrapper;
706    }
707    return wrapper === element ? undefined : wrapper;
708  }
709
  /** Given triples of [style, pattern, context] returns a lexing function.
    * The lexing function interprets the patterns to find token boundaries and
    * returns a decoration list of the form
    * [index_0, style_0, index_1, style_1, ..., index_n, style_n]
    * where index_n is an index into the sourceCode, and style_n is a style
    * constant like PR_PLAIN.  index_n-1 <= index_n, and style_n-1 applies to
    * all characters in sourceCode[index_n-1:index_n].
    *
    * The stylePatterns is a list whose elements have the form
    * [style : string, pattern : RegExp, DEPRECATED, shortcut : string].
    *
    * Style is a style constant like PR_PLAIN, or can be a string of the
    * form 'lang-FOO', where FOO is a language extension describing the
    * language of the portion of the token in $1 after pattern executes.
    * E.g., if style is 'lang-lisp', and group 1 contains the text
    * '(hello (world))', then that portion of the token will be passed to the
    * registered lisp handler for formatting.
    * The text before and after group 1 will be restyled using this decorator
    * so decorators should take care that this doesn't result in infinite
    * recursion.  For example, the HTML lexer rule for SCRIPT elements looks
    * something like ['lang-js', /<[s]cript>(.+?)<\/script>/].  This may match
    * '<script>foo()<\/script>', which would cause the current decorator to
    * be called with '<script>' which would not match the same rule since
    * group 1 must not be empty, so it would be instead styled as PR_TAG by
    * the generic tag rule.  The handler registered for the 'js' extension would
    * then be called with 'foo()', and finally, the current decorator would
    * be called with '<\/script>' which would not match the original rule and
    * so the generic tag rule would identify it as a tag.
    *
    * Pattern must only match prefixes, and if it matches a prefix, then that
    * match is considered a token with the same style.
    *
    * Context is applied to the last non-whitespace, non-comment token
    * recognized.
    *
    * Shortcut is an optional string of characters, any of which, if the first
    * character, guarantees that this pattern and only this pattern matches.
    *
    * @param {Array} shortcutStylePatterns patterns that always start with
    *   a known character.  Must have a shortcut string.
    * @param {Array} fallthroughStylePatterns patterns that will be tried in
    *   order if the shortcut ones fail.  May have shortcuts.
    *
    * @return {function (JobT)} a function that takes an undecorated job and
    *   attaches a list of decorations.
    */
756  function createSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns) {
757    var shortcuts = {};
758    var tokenizer;
759    (function () {
760      var allPatterns = shortcutStylePatterns.concat(fallthroughStylePatterns);
761      var allRegexs = [];
762      var regexKeys = {};
763      for (var i = 0, n = allPatterns.length; i < n; ++i) {
764        var patternParts = allPatterns[i];
765        var shortcutChars = patternParts[3];
766        if (shortcutChars) {
767          for (var c = shortcutChars.length; --c >= 0;) {
768            shortcuts[shortcutChars.charAt(c)] = patternParts;
769          }
770        }
771        var regex = patternParts[1];
772        var k = '' + regex;
773        if (!regexKeys.hasOwnProperty(k)) {
774          allRegexs.push(regex);
775          regexKeys[k] = null;
776        }
777      }
778      allRegexs.push(/[\0-\uffff]/);
779      tokenizer = combinePrefixPatterns(allRegexs);
780    })();
781
782    var nPatterns = fallthroughStylePatterns.length;
783
784    /**
785     * Lexes job.sourceCode and attaches an output array job.decorations of
786     * style classes preceded by the position at which they start in
787     * job.sourceCode in order.
788     *
789     * @type{function (JobT)}
790     */
791    var decorate = function (job) {
792      var sourceCode = job.sourceCode, basePos = job.basePos;
793      var sourceNode = job.sourceNode;
      /** Even entries are positions in source in ascending order.  Odd entries
        * are style markers (e.g., PR_COMMENT) that run from that position until
        * the end.
        * @type {DecorationsT}
        */
799      var decorations = [basePos, PR_PLAIN];
800      var pos = 0;  // index into sourceCode
801      var tokens = sourceCode.match(tokenizer) || [];
802      var styleCache = {};
803
804      for (var ti = 0, nTokens = tokens.length; ti < nTokens; ++ti) {
805        var token = tokens[ti];
806        var style = styleCache[token];
807        var match = void 0;
808
809        var isEmbedded;
810        if (typeof style === 'string') {
811          isEmbedded = false;
812        } else {
813          var patternParts = shortcuts[token.charAt(0)];
814          if (patternParts) {
815            match = token.match(patternParts[1]);
816            style = patternParts[0];
817          } else {
818            for (var i = 0; i < nPatterns; ++i) {
819              patternParts = fallthroughStylePatterns[i];
820              match = token.match(patternParts[1]);
821              if (match) {
822                style = patternParts[0];
823                break;
824              }
825            }
826
827            if (!match) {  // make sure that we make progress
828              style = PR_PLAIN;
829            }
830          }
831
832          isEmbedded = style.length >= 5 && 'lang-' === style.substring(0, 5);
833          if (isEmbedded && !(match && typeof match[1] === 'string')) {
834            isEmbedded = false;
835            style = PR_SOURCE;
836          }
837
838          if (!isEmbedded) { styleCache[token] = style; }
839        }
840
841        var tokenStart = pos;
842        pos += token.length;
843
844        if (!isEmbedded) {
845          decorations.push(basePos + tokenStart, style);
846        } else {  // Treat group 1 as an embedded block of source code.
847          var embeddedSource = match[1];
848          var embeddedSourceStart = token.indexOf(embeddedSource);
849          var embeddedSourceEnd = embeddedSourceStart + embeddedSource.length;
850          if (match[2]) {
851            // If embeddedSource can be blank, then it would match at the
852            // beginning which would cause us to infinitely recurse on the
853            // entire token, so we catch the right context in match[2].
854            embeddedSourceEnd = token.length - match[2].length;
855            embeddedSourceStart = embeddedSourceEnd - embeddedSource.length;
856          }
857          var lang = style.substring(5);
858          // Decorate the left of the embedded source
859          appendDecorations(
860              sourceNode,
861              basePos + tokenStart,
862              token.substring(0, embeddedSourceStart),
863              decorate, decorations);
864          // Decorate the embedded source
865          appendDecorations(
866              sourceNode,
867              basePos + tokenStart + embeddedSourceStart,
868              embeddedSource,
869              langHandlerForExtension(lang, embeddedSource),
870              decorations);
871          // Decorate the right of the embedded section
872          appendDecorations(
873              sourceNode,
874              basePos + tokenStart + embeddedSourceEnd,
875              token.substring(embeddedSourceEnd),
876              decorate, decorations);
877        }
878      }
879      job.decorations = decorations;
880    };
881    return decorate;
882  }
883
884  /** returns a function that produces a list of decorations from source text.
885    *
886    * This code treats ", ', and ` as string delimiters, and \ as a string
887    * escape.  It does not recognize perl's qq() style strings.
888    * It has no special handling for double delimiter escapes as in basic, or
889    * the tripled delimiters used in python, but should work on those regardless
890    * although in those cases a single string literal may be broken up into
891    * multiple adjacent string literals.
892    *
893    * It recognizes C, C++, and shell style comments.
894    *
895    * @param {Object} options a set of optional parameters.
896    * @return {function (JobT)} a function that examines the source code
897    *     in the input job and builds a decoration list which it attaches to
898    *     the job.
899    */
900  function sourceDecorator(options) {
901    var shortcutStylePatterns = [], fallthroughStylePatterns = [];
902    if (options['tripleQuotedStrings']) {
903      // '''multi-line-string''', 'single-line-string', and double-quoted
904      shortcutStylePatterns.push(
905          [PR_STRING,  /^(?:\'\'\'(?:[^\'\\]|\\[\s\S]|\'{1,2}(?=[^\']))*(?:\'\'\'|$)|\"\"\"(?:[^\"\\]|\\[\s\S]|\"{1,2}(?=[^\"]))*(?:\"\"\"|$)|\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$))/,
906           null, '\'"']);
907    } else if (options['multiLineStrings']) {
908      // 'multi-line-string', "multi-line-string"
909      shortcutStylePatterns.push(
910          [PR_STRING,  /^(?:\'(?:[^\\\']|\\[\s\S])*(?:\'|$)|\"(?:[^\\\"]|\\[\s\S])*(?:\"|$)|\`(?:[^\\\`]|\\[\s\S])*(?:\`|$))/,
911           null, '\'"`']);
912    } else {
913      // 'single-line-string', "single-line-string"
914      shortcutStylePatterns.push(
915          [PR_STRING,
916           /^(?:\'(?:[^\\\'\r\n]|\\.)*(?:\'|$)|\"(?:[^\\\"\r\n]|\\.)*(?:\"|$))/,
917           null, '"\'']);
918    }
919    if (options['verbatimStrings']) {
920      // verbatim-string-literal production from the C# grammar.  See issue 93.
921      fallthroughStylePatterns.push(
922          [PR_STRING, /^@\"(?:[^\"]|\"\")*(?:\"|$)/, null]);
923    }
924    var hc = options['hashComments'];
925    if (hc) {
926      if (options['cStyleComments']) {
927        if (hc > 1) {  // multiline hash comments
928          shortcutStylePatterns.push(
929              [PR_COMMENT, /^#(?:##(?:[^#]|#(?!##))*(?:###|$)|.*)/, null, '#']);
930        } else {
931          // Stop C preprocessor declarations at an unclosed open comment
932          shortcutStylePatterns.push(
933              [PR_COMMENT, /^#(?:(?:define|e(?:l|nd)if|else|error|ifn?def|include|line|pragma|undef|warning)\b|[^\r\n]*)/,
934               null, '#']);
935        }
936        // #include <stdio.h>
937        fallthroughStylePatterns.push(
938            [PR_STRING,
939             /^<(?:(?:(?:\.\.\/)*|\/?)(?:[\w-]+(?:\/[\w-]+)+)?[\w-]+\.h(?:h|pp|\+\+)?|[a-z]\w*)>/,
940             null]);
941      } else {
942        shortcutStylePatterns.push([PR_COMMENT, /^#[^\r\n]*/, null, '#']);
943      }
944    }
945    if (options['cStyleComments']) {
946      fallthroughStylePatterns.push([PR_COMMENT, /^\/\/[^\r\n]*/, null]);
947      fallthroughStylePatterns.push(
948          [PR_COMMENT, /^\/\*[\s\S]*?(?:\*\/|$)/, null]);
949    }
950    var regexLiterals = options['regexLiterals'];
951    if (regexLiterals) {
952      /**
953       * @const
954       */
955      var regexExcls = regexLiterals > 1
956        ? ''  // Multiline regex literals
957        : '\n\r';
958      /**
959       * @const
960       */
961      var regexAny = regexExcls ? '.' : '[\\S\\s]';
962      /**
963       * @const
964       */
965      var REGEX_LITERAL = (
966          // A regular expression literal starts with a slash that is
967          // not followed by * or / so that it is not confused with
968          // comments.
969          '/(?=[^/*' + regexExcls + '])'
970          // and then contains any number of raw characters,
971          + '(?:[^/\\x5B\\x5C' + regexExcls + ']'
972          // escape sequences (\x5C),
973          +    '|\\x5C' + regexAny
974          // or non-nesting character sets (\x5B\x5D);
975          +    '|\\x5B(?:[^\\x5C\\x5D' + regexExcls + ']'
976          +             '|\\x5C' + regexAny + ')*(?:\\x5D|$))+'
977          // finally closed by a /.
978          + '/');
979      fallthroughStylePatterns.push(
980          ['lang-regex',
981           RegExp('^' + REGEXP_PRECEDER_PATTERN + '(' + REGEX_LITERAL + ')')
982           ]);
983    }
984
985    var types = options['types'];
986    if (types) {
987      fallthroughStylePatterns.push([PR_TYPE, types]);
988    }
989
990    var keywords = ("" + options['keywords']).replace(/^ | $/g, '');
991    if (keywords.length) {
992      fallthroughStylePatterns.push(
993          [PR_KEYWORD,
994           new RegExp('^(?:' + keywords.replace(/[\s,]+/g, '|') + ')\\b'),
995           null]);
996    }
997
998    shortcutStylePatterns.push([PR_PLAIN,       /^\s+/, null, ' \r\n\t\xA0']);
999
1000    var punctuation =
1001      // The Bash man page says
1002
1003      // A word is a sequence of characters considered as a single
1004      // unit by GRUB. Words are separated by metacharacters,
1005      // which are the following plus space, tab, and newline: { }
1006      // | & $ ; < >
1007      // ...
1008
1009      // A word beginning with # causes that word and all remaining
1010      // characters on that line to be ignored.
1011
1012      // which means that only a '#' after /(?:^|[{}|&$;<>\s])/ starts a
1013      // comment but empirically
1014      // $ echo {#}
1015      // {#}
1016      // $ echo \$#
1017      // $#
1018      // $ echo }#
1019      // }#
1020
1021      // so /(?:^|[|&;<>\s])/ is more appropriate.
1022
1023      // http://gcc.gnu.org/onlinedocs/gcc-2.95.3/cpp_1.html#SEC3
1024      // suggests that this definition is compatible with a
1025      // default mode that tries to use a single token definition
1026      // to recognize both bash/python style comments and C
1027      // preprocessor directives.
1028
1029      // This definition of punctuation does not include # in the list of
1030      // follow-on exclusions, so # will not be broken before if preceeded
1031      // by a punctuation character.  We could try to exclude # after
1032      // [|&;<>] but that doesn't seem to cause many major problems.
1033      // If that does turn out to be a problem, we should change the below
1034      // when hc is truthy to include # in the run of punctuation characters
1035      // only when not followint [|&;<>].
1036      '^.[^\\s\\w.$@\'"`/\\\\]*';
1037    if (options['regexLiterals']) {
1038      punctuation += '(?!\s*\/)';
1039    }
1040
1041    fallthroughStylePatterns.push(
1042        // TODO(mikesamuel): recognize non-latin letters and numerals in idents
1043        [PR_LITERAL,     /^@[a-z_$][a-z_$@0-9]*/i, null],
1044        [PR_TYPE,        /^(?:[@_]?[A-Z]+[a-z][A-Za-z_$@0-9]*|\w+_t\b)/, null],
1045        [PR_PLAIN,       /^[a-z_$][a-z_$@0-9]*/i, null],
1046        [PR_LITERAL,
1047         new RegExp(
1048             '^(?:'
1049             // A hex number
1050             + '0x[a-f0-9]+'
1051             // or an octal or decimal number,
1052             + '|(?:\\d(?:_\\d+)*\\d*(?:\\.\\d*)?|\\.\\d\\+)'
1053             // possibly in scientific notation
1054             + '(?:e[+\\-]?\\d+)?'
1055             + ')'
1056             // with an optional modifier like UL for unsigned long
1057             + '[a-z]*', 'i'),
1058         null, '0123456789'],
1059        // Don't treat escaped quotes in bash as starting strings.
1060        // See issue 144.
1061        [PR_PLAIN,       /^\\[\s\S]?/, null],
1062        [PR_PUNCTUATION, new RegExp(punctuation), null]);
1063
1064    return createSimpleLexer(shortcutStylePatterns, fallthroughStylePatterns);
1065  }
1066
  // General-purpose decorator built from the union keyword list with hash
  // comments, C-style comments, multi-line strings and regex literals all
  // enabled.  It is registered further down under 'default-code'.
  var decorateSource = sourceDecorator({
        'keywords': ALL_KEYWORDS,
        'hashComments': true,
        'cStyleComments': true,
        'multiLineStrings': true,
        'regexLiterals': true
      });
1074
1075  /**
1076   * Given a DOM subtree, wraps it in a list, and puts each line into its own
1077   * list item.
1078   *
1079   * @param {Node} node modified in place.  Its content is pulled into an
1080   *     HTMLOListElement, and each line is moved into a separate list item.
1081   *     This requires cloning elements, so the input might not have unique
1082   *     IDs after numbering.
1083   * @param {number|null|boolean} startLineNum
1084   *     If truthy, coerced to an integer which is the 1-indexed line number
1085   *     of the first line of code.  The number of the first line will be
1086   *     attached to the list.
1087   * @param {boolean} isPreformatted true iff white-space in text nodes should
1088   *     be treated as significant.
1089   */
1090  function numberLines(node, startLineNum, isPreformatted) {
1091    var nocode = /(?:^|\s)nocode(?:\s|$)/;
1092    var lineBreak = /\r\n?|\n/;
1093
1094    var document = node.ownerDocument;
1095
1096    var li = document.createElement('li');
1097    while (node.firstChild) {
1098      li.appendChild(node.firstChild);
1099    }
1100    // An array of lines.  We split below, so this is initialized to one
1101    // un-split line.
1102    var listItems = [li];
1103
1104    function walk(node) {
1105      var type = node.nodeType;
1106      if (type == 1 && !nocode.test(node.className)) {  // Element
1107        if ('br' === node.nodeName.toLowerCase()) {
1108          breakAfter(node);
1109          // Discard the <BR> since it is now flush against a </LI>.
1110          if (node.parentNode) {
1111            node.parentNode.removeChild(node);
1112          }
1113        } else {
1114          for (var child = node.firstChild; child; child = child.nextSibling) {
1115            walk(child);
1116          }
1117        }
1118      } else if ((type == 3 || type == 4) && isPreformatted) {  // Text
1119        var text = node.nodeValue;
1120        var match = text.match(lineBreak);
1121        if (match) {
1122          var firstLine = text.substring(0, match.index);
1123          node.nodeValue = firstLine;
1124          var tail = text.substring(match.index + match[0].length);
1125          if (tail) {
1126            var parent = node.parentNode;
1127            parent.insertBefore(
1128              document.createTextNode(tail), node.nextSibling);
1129          }
1130          breakAfter(node);
1131          if (!firstLine) {
1132            // Don't leave blank text nodes in the DOM.
1133            node.parentNode.removeChild(node);
1134          }
1135        }
1136      }
1137    }
1138
1139    // Split a line after the given node.
1140    function breakAfter(lineEndNode) {
1141      // If there's nothing to the right, then we can skip ending the line
1142      // here, and move root-wards since splitting just before an end-tag
1143      // would require us to create a bunch of empty copies.
1144      while (!lineEndNode.nextSibling) {
1145        lineEndNode = lineEndNode.parentNode;
1146        if (!lineEndNode) { return; }
1147      }
1148
1149      function breakLeftOf(limit, copy) {
1150        // Clone shallowly if this node needs to be on both sides of the break.
1151        var rightSide = copy ? limit.cloneNode(false) : limit;
1152        var parent = limit.parentNode;
1153        if (parent) {
1154          // We clone the parent chain.
1155          // This helps us resurrect important styling elements that cross lines.
1156          // E.g. in <i>Foo<br>Bar</i>
1157          // should be rewritten to <li><i>Foo</i></li><li><i>Bar</i></li>.
1158          var parentClone = breakLeftOf(parent, 1);
1159          // Move the clone and everything to the right of the original
1160          // onto the cloned parent.
1161          var next = limit.nextSibling;
1162          parentClone.appendChild(rightSide);
1163          for (var sibling = next; sibling; sibling = next) {
1164            next = sibling.nextSibling;
1165            parentClone.appendChild(sibling);
1166          }
1167        }
1168        return rightSide;
1169      }
1170
1171      var copiedListItem = breakLeftOf(lineEndNode.nextSibling, 0);
1172
1173      // Walk the parent chain until we reach an unattached LI.
1174      for (var parent;
1175           // Check nodeType since IE invents document fragments.
1176           (parent = copiedListItem.parentNode) && parent.nodeType === 1;) {
1177        copiedListItem = parent;
1178      }
1179      // Put it on the list of lines for later processing.
1180      listItems.push(copiedListItem);
1181    }
1182
1183    // Split lines while there are lines left to split.
1184    for (var i = 0;  // Number of lines that have been split so far.
1185         i < listItems.length;  // length updated by breakAfter calls.
1186         ++i) {
1187      walk(listItems[i]);
1188    }
1189
1190    // Make sure numeric indices show correctly.
1191    if (startLineNum === (startLineNum|0)) {
1192      listItems[0].setAttribute('value', startLineNum);
1193    }
1194
1195    var ol = document.createElement('ol');
1196    ol.className = 'linenums';
1197    var offset = Math.max(0, ((startLineNum - 1 /* zero index */)) | 0) || 0;
1198    for (var i = 0, n = listItems.length; i < n; ++i) {
1199      li = listItems[i];
1200      // Stick a class on the LIs so that stylesheets can
1201      // color odd/even rows, or any other row pattern that
1202      // is co-prime with 10.
1203      li.className = 'L' + ((i + offset) % 10);
1204      if (!li.firstChild) {
1205        li.appendChild(document.createTextNode('\xA0'));
1206      }
1207      ol.appendChild(li);
1208    }
1209
1210    node.appendChild(ol);
1211  }
1212
1213
1214  /**
1215   * Breaks {@code job.sourceCode} around style boundaries in
1216   * {@code job.decorations} and modifies {@code job.sourceNode} in place.
1217   * @param {JobT} job
1218   * @private
1219   */
1220  function recombineTagsAndDecorations(job) {
1221    var isIE8OrEarlier = /\bMSIE\s(\d+)/.exec(navigator.userAgent);
1222    isIE8OrEarlier = isIE8OrEarlier && +isIE8OrEarlier[1] <= 8;
1223    var newlineRe = /\n/g;
1224
1225    var source = job.sourceCode;
1226    var sourceLength = source.length;
1227    // Index into source after the last code-unit recombined.
1228    var sourceIndex = 0;
1229
1230    var spans = job.spans;
1231    var nSpans = spans.length;
1232    // Index into spans after the last span which ends at or before sourceIndex.
1233    var spanIndex = 0;
1234
1235    var decorations = job.decorations;
1236    var nDecorations = decorations.length;
1237    // Index into decorations after the last decoration which ends at or before
1238    // sourceIndex.
1239    var decorationIndex = 0;
1240
1241    // Remove all zero-length decorations.
1242    decorations[nDecorations] = sourceLength;
1243    var decPos, i;
1244    for (i = decPos = 0; i < nDecorations;) {
1245      if (decorations[i] !== decorations[i + 2]) {
1246        decorations[decPos++] = decorations[i++];
1247        decorations[decPos++] = decorations[i++];
1248      } else {
1249        i += 2;
1250      }
1251    }
1252    nDecorations = decPos;
1253
1254    // Simplify decorations.
1255    for (i = decPos = 0; i < nDecorations;) {
1256      var startPos = decorations[i];
1257      // Conflate all adjacent decorations that use the same style.
1258      var startDec = decorations[i + 1];
1259      var end = i + 2;
1260      while (end + 2 <= nDecorations && decorations[end + 1] === startDec) {
1261        end += 2;
1262      }
1263      decorations[decPos++] = startPos;
1264      decorations[decPos++] = startDec;
1265      i = end;
1266    }
1267
1268    nDecorations = decorations.length = decPos;
1269
1270    var sourceNode = job.sourceNode;
1271    var oldDisplay = "";
1272    if (sourceNode) {
1273      oldDisplay = sourceNode.style.display;
1274      sourceNode.style.display = 'none';
1275    }
1276    try {
1277      var decoration = null;
1278      while (spanIndex < nSpans) {
1279        var spanStart = spans[spanIndex];
1280        var spanEnd = /** @type{number} */ (spans[spanIndex + 2])
1281            || sourceLength;
1282
1283        var decEnd = decorations[decorationIndex + 2] || sourceLength;
1284
1285        var end = Math.min(spanEnd, decEnd);
1286
1287        var textNode = /** @type{Node} */ (spans[spanIndex + 1]);
1288        var styledText;
1289        if (textNode.nodeType !== 1  // Don't muck with <BR>s or <LI>s
1290            // Don't introduce spans around empty text nodes.
1291            && (styledText = source.substring(sourceIndex, end))) {
1292          // This may seem bizarre, and it is.  Emitting LF on IE causes the
1293          // code to display with spaces instead of line breaks.
1294          // Emitting Windows standard issue linebreaks (CRLF) causes a blank
1295          // space to appear at the beginning of every line but the first.
1296          // Emitting an old Mac OS 9 line separator makes everything spiffy.
1297          if (isIE8OrEarlier) {
1298            styledText = styledText.replace(newlineRe, '\r');
1299          }
1300          textNode.nodeValue = styledText;
1301          var document = textNode.ownerDocument;
1302          var span = document.createElement('span');
1303          span.className = decorations[decorationIndex + 1];
1304          var parentNode = textNode.parentNode;
1305          parentNode.replaceChild(span, textNode);
1306          span.appendChild(textNode);
1307          if (sourceIndex < spanEnd) {  // Split off a text node.
1308            spans[spanIndex + 1] = textNode
1309                // TODO: Possibly optimize by using '' if there's no flicker.
1310                = document.createTextNode(source.substring(end, spanEnd));
1311            parentNode.insertBefore(textNode, span.nextSibling);
1312          }
1313        }
1314
1315        sourceIndex = end;
1316
1317        if (sourceIndex >= spanEnd) {
1318          spanIndex += 2;
1319        }
1320        if (sourceIndex >= decEnd) {
1321          decorationIndex += 2;
1322        }
1323      }
1324    } finally {
1325      if (sourceNode) {
1326        sourceNode.style.display = oldDisplay;
1327      }
1328    }
1329  }
1330
1331
1332  /** Maps language-specific file extensions to handlers. */
1333  var langHandlerRegistry = {};
1334  /** Register a language handler for the given file extensions.
1335    * @param {function (JobT)} handler a function from source code to a list
1336    *      of decorations.  Takes a single argument job which describes the
1337    *      state of the computation and attaches the decorations to it.
1338    * @param {Array.<string>} fileExtensions
1339    */
1340  function registerLangHandler(handler, fileExtensions) {
1341    for (var i = fileExtensions.length; --i >= 0;) {
1342      var ext = fileExtensions[i];
1343      if (!langHandlerRegistry.hasOwnProperty(ext)) {
1344        langHandlerRegistry[ext] = handler;
1345      } else if (win['console']) {
1346        console['warn']('cannot override language handler %s', ext);
1347      }
1348    }
1349  }
1350  function langHandlerForExtension(extension, source) {
1351    if (!(extension && langHandlerRegistry.hasOwnProperty(extension))) {
1352      // Treat it as markup if the first non whitespace character is a < and
1353      // the last non-whitespace character is a >.
1354      extension = /^\s*</.test(source)
1355          ? 'default-markup'
1356          : 'default-code';
1357    }
1358    return langHandlerRegistry[extension];
1359  }
  // Handler used when no registered extension applies and the source does not
  // look like markup.
  registerLangHandler(decorateSource, ['default-code']);
  // Markup handler.  Styles of the form 'lang-XXX' hand the text captured by
  // group 1 to the handler registered for XXX ('lang-' alone means "guess").
  registerLangHandler(
      createSimpleLexer(
          [],
          [
           [PR_PLAIN,       /^[^<?]+/],
           [PR_DECLARATION, /^<!\w[^>]*(?:>|$)/],
           [PR_COMMENT,     /^<\!--[\s\S]*?(?:-\->|$)/],
           // Unescaped content in an unknown language
           ['lang-',        /^<\?([\s\S]+?)(?:\?>|$)/],
           ['lang-',        /^<%([\s\S]+?)(?:%>|$)/],
           [PR_PUNCTUATION, /^(?:<[%?]|[%?]>)/],
           ['lang-',        /^<xmp\b[^>]*>([\s\S]+?)<\/xmp\b[^>]*>/i],
           // Unescaped content in javascript.  (Or possibly vbscript).
           ['lang-js',      /^<script\b[^>]*>([\s\S]*?)(<\/script\b[^>]*>)/i],
           // Contains unescaped stylesheet content
           ['lang-css',     /^<style\b[^>]*>([\s\S]*?)(<\/style\b[^>]*>)/i],
           // A whole tag is re-lexed by the 'in.tag' handler below.
           ['lang-in.tag',  /^(<\/?[a-z][^<>]*>)/i]
          ]),
      ['default-markup', 'htm', 'html', 'mxml', 'xhtml', 'xml', 'xsl']);
  // Lexer for the inside of a single tag: tag name, attribute names and
  // values, with on* and style attribute values handed to the js and css
  // handlers.
  registerLangHandler(
      createSimpleLexer(
          [
           [PR_PLAIN,        /^[\s]+/, null, ' \t\r\n'],
           [PR_ATTRIB_VALUE, /^(?:\"[^\"]*\"?|\'[^\']*\'?)/, null, '\"\'']
           ],
          [
           // NOTE(review): the doubled ^^ looks deliberate (the same idiom
           // would anchor to the true start of input if
           // combinePrefixPatterns strips only a single ^) - confirm there
           // before "simplifying" it.
           [PR_TAG,          /^^<\/?[a-z](?:[\w.:-]*\w)?|\/?>$/i],
           [PR_ATTRIB_NAME,  /^(?!style[\s=]|on)[a-z](?:[\w:-]*\w)?/i],
           // An unquoted attribute value; styled by the 'uq.val' handler.
           ['lang-uq.val',   /^=\s*([^>\'\"\s]*(?:[^>\'\"\s\/]|\/(?=\s)))/],
           [PR_PUNCTUATION,  /^[=<>\/]+/],
           ['lang-js',       /^on\w+\s*=\s*\"([^\"]+)\"/i],
           ['lang-js',       /^on\w+\s*=\s*\'([^\']+)\'/i],
           ['lang-js',       /^on\w+\s*=\s*([^\"\'>\s]+)/i],
           ['lang-css',      /^style\s*=\s*\"([^\"]+)\"/i],
           ['lang-css',      /^style\s*=\s*\'([^\']+)\'/i],
           ['lang-css',      /^style\s*=\s*([^\"\'>\s]+)/i]
           ]),
      ['in.tag']);
  // Styles an entire unquoted attribute value as PR_ATTRIB_VALUE.
  registerLangHandler(
      createSimpleLexer([], [[PR_ATTRIB_VALUE, /^[\s\S]+/]]), ['uq.val']);
  // Per-language decorators; each differs only in the options passed to
  // sourceDecorator.
  registerLangHandler(sourceDecorator({
          'keywords': CPP_KEYWORDS,
          'hashComments': true,
          'cStyleComments': true,
          'types': C_TYPES
        }), ['c', 'cc', 'cpp', 'cxx', 'cyc', 'm']);
  registerLangHandler(sourceDecorator({
          'keywords': 'null,true,false'
        }), ['json']);
  registerLangHandler(sourceDecorator({
          'keywords': CSHARP_KEYWORDS,
          'hashComments': true,
          'cStyleComments': true,
          'verbatimStrings': true,
          'types': C_TYPES
        }), ['cs']);
  registerLangHandler(sourceDecorator({
          'keywords': JAVA_KEYWORDS,
          'cStyleComments': true
        }), ['java']);
  registerLangHandler(sourceDecorator({
          'keywords': SH_KEYWORDS,
          'hashComments': true,
          'multiLineStrings': true
        }), ['bash', 'bsh', 'csh', 'sh']);
  // tripleQuotedStrings is checked before multiLineStrings by
  // sourceDecorator, so the latter has no additional effect here.
  registerLangHandler(sourceDecorator({
          'keywords': PYTHON_KEYWORDS,
          'hashComments': true,
          'multiLineStrings': true,
          'tripleQuotedStrings': true
        }), ['cv', 'py', 'python']);
  registerLangHandler(sourceDecorator({
          'keywords': PERL_KEYWORDS,
          'hashComments': true,
          'multiLineStrings': true,
          'regexLiterals': 2  // multiline regex literals
        }), ['perl', 'pl', 'pm']);
  registerLangHandler(sourceDecorator({
          'keywords': RUBY_KEYWORDS,
          'hashComments': true,
          'multiLineStrings': true,
          'regexLiterals': true
        }), ['rb', 'ruby']);
  registerLangHandler(sourceDecorator({
          'keywords': JSCRIPT_KEYWORDS,
          'cStyleComments': true,
          'regexLiterals': true
        }), ['javascript', 'js', 'ts', 'typescript']);
1449  registerLangHandler(sourceDecorator({
1450          'keywords': COFFEE_KEYWORDS,
1451          'hashComments': 3,  // ### style block comments
1452          'cStyleComments': true,
1453          'multilineStrings': true,
1454          'tripleQuotedStrings': true,
1455          'regexLiterals': true
1456        }), ['coffee']);
  // Handler for text embedded with the 'lang-regex' style (regex literals
  // found by sourceDecorator): the whole text is styled as PR_STRING.
  registerLangHandler(
      createSimpleLexer([], [[PR_STRING, /^[\s\S]+/]]), ['regex']);
1459
1460  /** @param {JobT} job */
1461  function applyDecorator(job) {
1462    var opt_langExtension = job.langExtension;
1463
1464    try {
1465      // Extract tags, and convert the source code to plain text.
1466      var sourceAndSpans = extractSourceSpans(job.sourceNode, job.pre);
1467      /** Plain text. @type {string} */
1468      var source = sourceAndSpans.sourceCode;
1469      job.sourceCode = source;
1470      job.spans = sourceAndSpans.spans;
1471      job.basePos = 0;
1472
1473      // Apply the appropriate language handler
1474      langHandlerForExtension(opt_langExtension, source)(job);
1475
1476      // Integrate the decorations and tags back into the source code,
1477      // modifying the sourceNode in place.
1478      recombineTagsAndDecorations(job);
1479    } catch (e) {
1480      if (win['console']) {
1481        console['log'](e && e['stack'] || e);
1482      }
1483    }
1484  }
1485
1486  /**
1487   * Pretty print a chunk of code.
1488   * @param sourceCodeHtml {string} The HTML to pretty print.
1489   * @param opt_langExtension {string} The language name to use.
1490   *     Typically, a filename extension like 'cpp' or 'java'.
1491   * @param opt_numberLines {number|boolean} True to number lines,
1492   *     or the 1-indexed number of the first line in sourceCodeHtml.
1493   */
1494  function $prettyPrintOne(sourceCodeHtml, opt_langExtension, opt_numberLines) {
1495    /** @type{number|boolean} */
1496    var nl = opt_numberLines || false;
1497    /** @type{string|null} */
1498    var langExtension = opt_langExtension || null;
1499    /** @type{!Element} */
1500    var container = document.createElement('div');
1501    // This could cause images to load and onload listeners to fire.
1502    // E.g. <img onerror="alert(1337)" src="nosuchimage.png">.
1503    // We assume that the inner HTML is from a trusted source.
1504    // The pre-tag is required for IE8 which strips newlines from innerHTML
1505    // when it is injected into a <pre> tag.
1506    // http://stackoverflow.com/questions/451486/pre-tag-loses-line-breaks-when-setting-innerhtml-in-ie
1507    // http://stackoverflow.com/questions/195363/inserting-a-newline-into-a-pre-tag-ie-javascript
1508    container.innerHTML = '<pre>' + sourceCodeHtml + '</pre>';
1509    container = /** @type{!Element} */(container.firstChild);
1510    if (nl) {
1511      numberLines(container, nl, true);
1512    }
1513
1514    /** @type{JobT} */
1515    var job = {
1516      langExtension: langExtension,
1517      numberLines: nl,
1518      sourceNode: container,
1519      pre: 1,
1520      sourceCode: null,
1521      basePos: null,
1522      spans: null,
1523      decorations: null
1524    };
1525    applyDecorator(job);
1526    return container.innerHTML;
1527  }
1528
1529   /**
1530    * Find all the {@code <pre>} and {@code <code>} tags in the DOM with
1531    * {@code class=prettyprint} and prettify them.
1532    *
1533    * @param {Function} opt_whenDone called when prettifying is done.
1534    * @param {HTMLElement|HTMLDocument} opt_root an element or document
1535    *   containing all the elements to pretty print.
1536    *   Defaults to {@code document.body}.
1537    */
1538  function $prettyPrint(opt_whenDone, opt_root) {
1539    var root = opt_root || document.body;
1540    var doc = root.ownerDocument || document;
1541    function byTagName(tn) { return root.getElementsByTagName(tn); }
1542    // fetch a list of nodes to rewrite
1543    var codeSegments = [byTagName('pre'), byTagName('code'), byTagName('xmp')];
1544    var elements = [];
1545    for (var i = 0; i < codeSegments.length; ++i) {
1546      for (var j = 0, n = codeSegments[i].length; j < n; ++j) {
1547        elements.push(codeSegments[i][j]);
1548      }
1549    }
1550    codeSegments = null;
1551
1552    var clock = Date;
1553    if (!clock['now']) {
1554      clock = { 'now': function () { return +(new Date); } };
1555    }
1556
1557    // The loop is broken into a series of continuations to make sure that we
1558    // don't make the browser unresponsive when rewriting a large page.
1559    var k = 0;
1560
1561    var langExtensionRe = /\blang(?:uage)?-([\w.]+)(?!\S)/;
1562    var prettyPrintRe = /\bprettyprint\b/;
1563    var prettyPrintedRe = /\bprettyprinted\b/;
1564    var preformattedTagNameRe = /pre|xmp/i;
1565    var codeRe = /^code$/i;
1566    var preCodeXmpRe = /^(?:pre|code|xmp)$/i;
1567    var EMPTY = {};
1568
1569    function doWork() {
1570      var endTime = (win['PR_SHOULD_USE_CONTINUATION'] ?
1571                     clock['now']() + 250 /* ms */ :
1572                     Infinity);
1573      for (; k < elements.length && clock['now']() < endTime; k++) {
1574        var cs = elements[k];
1575
1576        // Look for a preceding comment like
1577        // <?prettify lang="..." linenums="..."?>
1578        var attrs = EMPTY;
1579        {
1580          for (var preceder = cs; (preceder = preceder.previousSibling);) {
1581            var nt = preceder.nodeType;
1582            // <?foo?> is parsed by HTML 5 to a comment node (8)
1583            // like <!--?foo?-->, but in XML is a processing instruction
1584            var value = (nt === 7 || nt === 8) && preceder.nodeValue;
1585            if (value
1586                ? !/^\??prettify\b/.test(value)
1587                : (nt !== 3 || /\S/.test(preceder.nodeValue))) {
1588              // Skip over white-space text nodes but not others.
1589              break;
1590            }
1591            if (value) {
1592              attrs = {};
1593              value.replace(
1594                  /\b(\w+)=([\w:.%+-]+)/g,
1595                function (_, name, value) { attrs[name] = value; });
1596              break;
1597            }
1598          }
1599        }
1600
1601        var className = cs.className;
1602        if ((attrs !== EMPTY || prettyPrintRe.test(className))
1603            // Don't redo this if we've already done it.
1604            // This allows recalling pretty print to just prettyprint elements
1605            // that have been added to the page since last call.
1606            && !prettyPrintedRe.test(className)) {
1607
1608          // make sure this is not nested in an already prettified element
1609          var nested = false;
1610          for (var p = cs.parentNode; p; p = p.parentNode) {
1611            var tn = p.tagName;
1612            if (preCodeXmpRe.test(tn)
1613                && p.className && prettyPrintRe.test(p.className)) {
1614              nested = true;
1615              break;
1616            }
1617          }
1618          if (!nested) {
1619            // Mark done.  If we fail to prettyprint for whatever reason,
1620            // we shouldn't try again.
1621            cs.className += ' prettyprinted';
1622
1623            // If the classes includes a language extensions, use it.
1624            // Language extensions can be specified like
1625            //     <pre class="prettyprint lang-cpp">
1626            // the language extension "cpp" is used to find a language handler
1627            // as passed to PR.registerLangHandler.
1628            // HTML5 recommends that a language be specified using "language-"
1629            // as the prefix instead.  Google Code Prettify supports both.
1630            // http://dev.w3.org/html5/spec-author-view/the-code-element.html
1631            var langExtension = attrs['lang'];
1632            if (!langExtension) {
1633              langExtension = className.match(langExtensionRe);
1634              // Support <pre class="prettyprint"><code class="language-c">
1635              var wrapper;
1636              if (!langExtension && (wrapper = childContentWrapper(cs))
1637                  && codeRe.test(wrapper.tagName)) {
1638                langExtension = wrapper.className.match(langExtensionRe);
1639              }
1640
1641              if (langExtension) { langExtension = langExtension[1]; }
1642            }
1643
1644            var preformatted;
1645            if (preformattedTagNameRe.test(cs.tagName)) {
1646              preformatted = 1;
1647            } else {
1648              var currentStyle = cs['currentStyle'];
1649              var defaultView = doc.defaultView;
1650              var whitespace = (
1651                  currentStyle
1652                  ? currentStyle['whiteSpace']
1653                  : (defaultView
1654                     && defaultView.getComputedStyle)
1655                  ? defaultView.getComputedStyle(cs, null)
1656                  .getPropertyValue('white-space')
1657                  : 0);
1658              preformatted = whitespace
1659                  && 'pre' === whitespace.substring(0, 3);
1660            }
1661
1662            // Look for a class like linenums or linenums:<n> where <n> is the
1663            // 1-indexed number of the first line.
1664            var lineNums = attrs['linenums'];
1665            if (!(lineNums = lineNums === 'true' || +lineNums)) {
1666              lineNums = className.match(/\blinenums\b(?::(\d+))?/);
1667              lineNums =
1668                lineNums
1669                ? lineNums[1] && lineNums[1].length
1670                  ? +lineNums[1] : true
1671                : false;
1672            }
1673            if (lineNums) { numberLines(cs, lineNums, preformatted); }
1674
1675            // do the pretty printing
1676            var prettyPrintingJob = {
1677              langExtension: langExtension,
1678              sourceNode: cs,
1679              numberLines: lineNums,
1680              pre: preformatted,
1681              sourceCode: null,
1682              basePos: null,
1683              spans: null,
1684              decorations: null
1685            };
1686            applyDecorator(prettyPrintingJob);
1687          }
1688        }
1689      }
1690      if (k < elements.length) {
1691        // finish up in a continuation
1692        win.setTimeout(doWork, 250);
1693      } else if ('function' === typeof opt_whenDone) {
1694        opt_whenDone();
1695      }
1696    }
1697
1698    doWork();
1699  }
1700
1701  /**
1702   * Contains functions for creating and registering new language handlers.
1703   * @type {Object}
1704   */
1705  var PR = win['PR'] = {
1706        'createSimpleLexer': createSimpleLexer,
1707        'registerLangHandler': registerLangHandler,
1708        'sourceDecorator': sourceDecorator,
1709        'PR_ATTRIB_NAME': PR_ATTRIB_NAME,
1710        'PR_ATTRIB_VALUE': PR_ATTRIB_VALUE,
1711        'PR_COMMENT': PR_COMMENT,
1712        'PR_DECLARATION': PR_DECLARATION,
1713        'PR_KEYWORD': PR_KEYWORD,
1714        'PR_LITERAL': PR_LITERAL,
1715        'PR_NOCODE': PR_NOCODE,
1716        'PR_PLAIN': PR_PLAIN,
1717        'PR_PUNCTUATION': PR_PUNCTUATION,
1718        'PR_SOURCE': PR_SOURCE,
1719        'PR_STRING': PR_STRING,
1720        'PR_TAG': PR_TAG,
1721        'PR_TYPE': PR_TYPE,
1722        'prettyPrintOne':
1723           IN_GLOBAL_SCOPE
1724             ? (win['prettyPrintOne'] = $prettyPrintOne)
1725             : (prettyPrintOne = $prettyPrintOne),
1726        'prettyPrint':
1727           IN_GLOBAL_SCOPE
1728             ? (win['prettyPrint'] = $prettyPrint)
1729             : (prettyPrint = $prettyPrint)
1730      };
1731
1732  // Make PR available via the Asynchronous Module Definition (AMD) API.
1733  // Per https://github.com/amdjs/amdjs-api/wiki/AMD:
1734  // The Asynchronous Module Definition (AMD) API specifies a
1735  // mechanism for defining modules such that the module and its
1736  // dependencies can be asynchronously loaded.
1737  // ...
1738  // To allow a clear indicator that a global define function (as
1739  // needed for script src browser loading) conforms to the AMD API,
1740  // any global define function SHOULD have a property called "amd"
1741  // whose value is an object. This helps avoid conflict with any
1742  // other existing JavaScript code that could have defined a define()
1743  // function that does not conform to the AMD API.
1744  var define = win['define'];
1745  if (typeof define === "function" && define['amd']) {
1746    define("google-code-prettify", [], function () {
1747      return PR;
1748    });
1749  }
1750})();
1751