// Copyright (C) 2006 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


/**
 * @fileoverview
 * some functions for browser-side pretty printing of code contained in html.
 *
 * The lexer should work on a number of languages including C and friends,
 * Java, Python, Bash, SQL, HTML, XML, CSS, Javascript, and Makefiles.
 * It works passably on Ruby, PHP and Awk and a decent subset of Perl, but,
 * because of commenting conventions, doesn't work on Smalltalk, Lisp-like, or
 * CAML-like languages.
 *
 * If there's a language not mentioned here, then I don't know it, and don't
 * know whether it works.  If it has a C-like, Bash-like, or XML-like syntax
 * then it should work passably.
 *
 * Usage:
 * 1) include this source file in an html page via
 *    <script type="text/javascript" src="/path/to/prettify.js"></script>
 * 2) define style rules.  See the example page for examples.
 * 3) mark the <pre> and <code> tags in your source with class=prettyprint.
 *    You can also use the (html deprecated) <xmp> tag, but the pretty printer
 *    needs to do more substantial DOM manipulations to support that, so some
 *    css styles may not be preserved.
 * That's it.  I wanted to keep the API as simple as possible, so there's no
 * need to specify which language the code is in.
 *
 * Change log:
 * cbeust, 2006/08/22
 *   Java annotations (start with "@") are now captured as literals ("lit")
 */

/**
 * set of recognized keywords: maps each keyword to true.  Look entries up
 * with an own-property or `=== true` check, since a plain object also
 * inherits names such as "constructor".
 */
var PR_keywords = {};
/** initialize the keyword list for our target languages. */
(function () {
  // NOTE: the stray words "declaration," and "directive" (residue of the
  // MSDN entries "using declaration, using directive") were removed from
  // this list; neither is a keyword in any of the target languages.
  var CPP_KEYWORDS = "abstract bool break case catch char class const " +
    "const_cast continue default delete deprecated dllexport dllimport do " +
    "double dynamic_cast else enum explicit extern false float for friend " +
    "goto if inline int long mutable naked namespace new noinline noreturn " +
    "nothrow novtable operator private property protected public register " +
    "reinterpret_cast return selectany short signed sizeof static " +
    "static_cast struct switch template this thread throw true try typedef " +
    "typeid typename union unsigned using uuid " +
    "virtual void volatile while typeof";
  var CSHARP_KEYWORDS = "as base by byte checked decimal delegate descending " +
    "event finally fixed foreach from group implicit in interface internal " +
    "into is lock null object out override orderby params readonly ref sbyte " +
    "sealed stackalloc string select uint ulong unchecked unsafe ushort var";
  var JAVA_KEYWORDS = "package synchronized boolean implements import throws " +
    "instanceof transient extends final strictfp native super";
  var JSCRIPT_KEYWORDS = "debugger export function with NaN Infinity";
  var PERL_KEYWORDS = "require sub unless until use elsif BEGIN END";
  var PYTHON_KEYWORDS = "and assert def del elif except exec global lambda " +
    "not or pass print raise yield False True None";
  var RUBY_KEYWORDS = "then end begin rescue ensure module when undef next " +
    "redo retry alias defined";
  var SH_KEYWORDS = "done fi";

  var KEYWORDS = [CPP_KEYWORDS, CSHARP_KEYWORDS, JAVA_KEYWORDS,
                  JSCRIPT_KEYWORDS, PERL_KEYWORDS, PYTHON_KEYWORDS,
                  RUBY_KEYWORDS, SH_KEYWORDS];
  for (var k = 0; k < KEYWORDS.length; k++) {
    var kw = KEYWORDS[k].split(' ');
    for (var i = 0; i < kw.length; i++) {
      // skip empty strings produced by doubled spaces
      if (kw[i]) { PR_keywords[kw[i]] = true; }
    }
  }
}).call(this);

// token style names.  correspond to css classes
/** token style for a string literal */
var PR_STRING = 'str';
/** token style for a keyword */
var PR_KEYWORD = 'kwd';
/** token style for a comment */
var PR_COMMENT = 'com';
/** token style for a type */
var PR_TYPE = 'typ';
/** token style for a literal value.  e.g. 1, null, true. */
var PR_LITERAL = 'lit';
/** token style for a punctuation string. */
var PR_PUNCTUATION = 'pun';
/** token style for a plain string. */
var PR_PLAIN = 'pln';
/** token style for a regexp. */
var PR_REGEX = 'reg';
/** token style for a constant. */
var PR_CONSTANT = 'const';


/** token style for an sgml tag. */
var PR_TAG = 'tag';
/** token style for a markup declaration such as a DOCTYPE. */
var PR_DECLARATION = 'dec';
/** token style for embedded source. */
var PR_SOURCE = 'src';
/** token style for an sgml attribute name. */
var PR_ATTRIB_NAME = 'atn';
/** token style for an sgml attribute value. */
var PR_ATTRIB_VALUE = 'atv';
/** token style for a new line. */
var PR_NL = 'nl';

/** the number of characters between tab columns */
var PR_TAB_WIDTH = 8;

/** the position of the end of a token during lexing.  A division of a string
  * into n tokens can be represented as a series of n - 1 token ends, as long
  * as runs of whitespace warrant their own token.
  * @constructor
  * @param {Number} end position just past the last character of the token
  * @param style one of the token styles, or null.  Must not be undefined.
  * @throws {Error} if style is undefined or end is not a number
  * @private
  */
function PR_TokenEnd(end, style) {
  if (undefined === style) { throw new Error('BAD'); }
  if ('number' != typeof(end)) { throw new Error('BAD'); }
  this.end = end;
  this.style = style;
}
PR_TokenEnd.prototype.toString = function () {
  return '[PR_TokenEnd ' + this.end +
    (this.style ? ':' + this.style : '') + ']';
};
/** a chunk of text with a style.  These are used to represent both the output
  * from the lexing functions as well as intermediate results.
  * @constructor
  * @param token the token text
  * @param style one of the token styles defined in designdoc-template, or null
  *   for a styleless token, such as an embedded html tag.
  * @throws {Error} if style is undefined
  * @private
  */
function PR_Token(token, style) {
  if (undefined === style) { throw new Error('BAD'); }
  this.token = token;
  this.style = style;
}

PR_Token.prototype.toString = function () {
  return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
};


/** a helper class that decodes common html entities used to escape special
  * characters in source code.  After a call to decode, `ch` holds the decoded
  * character and `next` the index of the first character after it.
  * @constructor
  * @private
  */
function PR_DecodeHelper() {
  this.next = 0;
  this.ch = '\0';
}

var PR_NAMED_ENTITIES = {
  'lt': '<',
  'gt': '>',
  'quot': '"',
  'apos': "'",
  'amp': '&'  // reencoding requires that & always be decoded properly
};

/** longest entity name (the text between '&' and ';') that decode will
  * consider.  8 covers every named entity above and numeric references up to
  * "#x10FFFF".  The previous limit of 3 silently excluded "quot", "apos" and
  * almost all numeric references.
  * @private
  */
var PR_MAX_ENTITY_NAME_LENGTH = 8;

/** decodes the character or html entity starting at s[i].
  * @param {String} s html text
  * @param {Number} i index into s of the character to decode
  * @return {String} the decoded character, also stored in this.ch.  An
  *   unrecognized entity yields '\0' and only consumes the '&'.
  */
PR_DecodeHelper.prototype.decode = function (s, i) {
  var next = i + 1;
  var ch = s.charAt(i);
  if ('&' === ch) {
    var semi = s.indexOf(';', next);
    if (semi >= 0 && semi - next <= PR_MAX_ENTITY_NAME_LENGTH) {
      var entityName = s.substring(next, semi);
      var decoded = null;
      if (entityName.charAt(0) === '#') {  // check for numeric entity
        var ch1 = entityName.charAt(1);
        var charCode;
        if (ch1 === 'x' || ch1 === 'X') {  // like &#xA0;
          charCode = parseInt(entityName.substring(2), 16);
        } else {  // like &#160;
          charCode = parseInt(entityName.substring(1), 10);
        }
        if (!isNaN(charCode) && charCode >= 0) {
          if (charCode <= 0xFFFF) {
            decoded = String.fromCharCode(charCode);
          } else if (charCode <= 0x10FFFF) {
            // fromCharCode truncates to 16 bits, so build the surrogate pair
            var offset = charCode - 0x10000;
            decoded = String.fromCharCode(0xD800 + (offset >> 10),
                                          0xDC00 + (offset & 0x3FF));
          }
        }
      }
      if (!decoded) {
        // own-property check so inherited names never count as entities
        var key = entityName.toLowerCase();
        if (Object.prototype.hasOwnProperty.call(PR_NAMED_ENTITIES, key)) {
          decoded = PR_NAMED_ENTITIES[key];
        }
      }
      if (decoded) {
        ch = decoded;
        next = semi + 1;
      } else {  // skip over unrecognized entity
        next = i + 1;
        ch = '\0';
      }
    }
  }
  this.next = next;
  this.ch = ch;
  return this.ch;
};


// some string utilities

/** true iff ch is an ASCII letter. */
function PR_isWordChar(ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

/** true iff ch can start an identifier: a letter, '_', '$' or '@'. */
function PR_isIdentifierStart(ch) {
  return PR_isWordChar(ch) || ch == '_' || ch == '$' || ch == '@';
}

/** true iff ch can appear inside an identifier. */
function PR_isIdentifierPart(ch) {
  return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
}

/** true iff ch is a tab, space, carriage return or line feed. */
function PR_isSpaceChar(ch) {
  return "\t \r\n".indexOf(ch) >= 0;
}

/** true iff ch is an ASCII decimal digit. */
function PR_isDigitChar(ch) {
  return ch >= '0' && ch <= '9';
}

/** strips leading and trailing space characters from s. */
function PR_trim(s) {
  var i = 0, j = s.length - 1;
  while (i <= j && PR_isSpaceChar(s.charAt(i))) { ++i; }
  while (j > i && PR_isSpaceChar(s.charAt(j))) { --j; }
  return s.substring(i, j + 1);
}

/** true iff s begins with prefix. */
function PR_startsWith(s, prefix) {
  return s.length >= prefix.length && prefix == s.substring(0, prefix.length);
}

/** true iff s ends with suffix. */
function PR_endsWith(s, suffix) {
  return s.length >= suffix.length &&
         suffix == s.substring(s.length - suffix.length, s.length);
}

/** a set of tokens that can precede a regular expression literal in javascript.
  * http://www.mozilla.org/js/language/js20/rationale/syntax.html has the full
  * list, but I've removed ones that might be problematic when seen in languages
  * that don't support regular expression literals.
  *
  * <p>Specifically, I've removed any keywords that can't precede a regexp
  * literal in a syntactically legal javascript program, and I've removed the
  * "in" keyword since it's not a keyword in many languages, and might be used
  * as a count of inches.
  *
  * <p>The resulting pattern is anchored at the end of its input, so it is
  * meant to be tested against the text that precedes a '/'.
  * @private
  */
var REGEXP_PRECEDER_PATTERN = (function () {
    var preceders = [
        "!", "!=", "!==", "#", "%", "%=", "&", "&&", "&&=",
        "&=", "(", "*", "*=", /* "+", */ "+=", ",", /* "-", */ "-=",
        "->", /*".", "..", "...", handled below */ "/", "/=", ":", "::", ";",
        "<", "<<", "<<=", "<=", "=", "==", "===", ">",
        ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[",
        "^", "^=", "^^", "^^=", "{", "|", "|=", "||",
        "||=", "~", "break", "case", "continue", "delete",
        "do", "else", "finally", "instanceof",
        "return", "throw", "try", "typeof"
        ];
    var pattern = '(?:' +
        '(?:(?:^|[^0-9.])\\.{1,3})|' +  // a dot that's not part of a number
        '(?:(?:^|[^\\+])\\+)|' +  // allow + but not ++
        '(?:(?:^|[^\\-])-)'  // allow - but not --
        ;
    for (var i = 0; i < preceders.length; ++i) {
      var preceder = preceders[i];
      if (PR_isWordChar(preceder.charAt(0))) {
        // keywords must start on a word boundary
        pattern += '|\\b' + preceder;
      } else {
        // backslash-escape every punctuation character except = < > : &
        pattern += '|' + preceder.replace(/([^=<>:&])/g, '\\$1');
      }
    }
    pattern += ')\\s*$';  // matches at end
    return new RegExp(pattern);
    // CAVEAT: this does not properly handle the case where a regular expression
    // immediately follows another since a regular expression may have flags
    // for case-sensitivity and the like.  Having regexp tokens adjacent is not
    // valid in any language I'm aware of, so I'm punting.
    // TODO: maybe style special characters inside a regexp as punctuation.
  })();

/** true iff prefix matches the first prefix.length characters in chars[0:len].
  * @param {Array} chars array of single character strings
  * @param {Number} len the number of valid entries in chars
  * @param {String} prefix
  * @private
  */
function PR_prefixMatch(chars, len, prefix) {
  if (len < prefix.length) { return false; }
  for (var i = 0, n = prefix.length; i < n; ++i) {
    if (prefix.charAt(i) != chars[i]) { return false; }
  }
  return true;
}
/** like textToHtml but escapes double quotes to be attribute safe.
  * @param {String} str plain text
  * @return {String} html safe to place inside a double quoted attribute
  */
function PR_attribToHtml(str) {
  // '&' must be escaped first so the entities added below are not re-escaped
  return str.replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/\"/g, '&quot;')
    .replace(/\xa0/g, '&nbsp;');
}

/** escapes html special characters to html.
  * @param {String} str plain text
  * @return {String} html
  */
function PR_textToHtml(str) {
  // '&' must be escaped first so the entities added below are not re-escaped
  return str.replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/\xa0/g, '&nbsp;');
}

/** is the given node's innerHTML normally unescaped? */
function PR_isRawContent(node) {
  return 'XMP' == node.tagName;
}

/** lazily computed: true iff innerHTML escapes text content properly in this
  * browser.  null until the first call to PR_getInnerHtml.
  */
var PR_innerHtmlWorks = null;

/** returns the properly escaped html content of node.
  * @param {Node} node a DOM element
  * @return {String} html
  */
function PR_getInnerHtml(node) {
  // inner html is hopelessly broken in Safari 2.0.4 when the content is
  // an html description of well formed XML and the containing tag is a PRE
  // tag, so we detect that case and emulate innerHTML.
  if (null == PR_innerHtmlWorks) {
    var testNode = document.createElement('PRE');
    testNode.appendChild(
        document.createTextNode('<!DOCTYPE foo PUBLIC "foo bar">\n<foo />'));
    // a working innerHTML escapes every '<' in the text node
    PR_innerHtmlWorks = !/</.test(testNode.innerHTML);
  }

  if (PR_innerHtmlWorks) {
    var content = node.innerHTML;
    // XMP tags contain unescaped entities so require special handling.
    if (PR_isRawContent(node)) {
      content = PR_textToHtml(content);
    }
    return content;
  }

  var out = [];
  for (var child = node.firstChild; child; child = child.nextSibling) {
    PR_normalizedHtml(child, out);
  }
  return out.join('');
}
/**
 * walks the DOM returning a properly escaped version of innerHTML.
 * @param {Node} node an element, attribute, text or CDATA node.  Other node
 *   types are ignored.
 * @param {Array} out array of strings that the html is appended to
 */
function PR_normalizedHtml(node, out) {
  switch (node.nodeType) {
    case 1:  // an element
      var name = node.tagName.toLowerCase();
      // '\x3c' is '<'.  A hex escape replaces the old octal escape, which is
      // a syntax error in strict mode and module code.
      out.push('\x3c', name);
      for (var i = 0; i < node.attributes.length; ++i) {
        var attr = node.attributes[i];
        if (!attr.specified) { continue; }
        out.push(' ');
        PR_normalizedHtml(attr, out);
      }
      out.push('>');
      for (var child = node.firstChild; child; child = child.nextSibling) {
        PR_normalizedHtml(child, out);
      }
      // empty br, link and img elements get no close tag
      if (node.firstChild || !/^(?:br|link|img)$/.test(name)) {
        out.push('<\/', name, '>');
      }
      break;
    case 2: // an attribute
      out.push(node.name.toLowerCase(), '="', PR_attribToHtml(node.value), '"');
      break;
    case 3: case 4: // text
      out.push(PR_textToHtml(node.nodeValue));
      break;
  }
}

/** expand tabs to spaces
  * @param {Array} chunks PR_Tokens possibly containing tabs
  * @param {Number} tabWidth number of spaces between tab columns
  * @return {Array} chunks with tabs replaced with spaces
  */
function PR_expandTabs(chunks, tabWidth) {
  // padding is emitted in slices of this string
  var SPACES = '                ';

  // column of the next character; carried across chunks
  var charInLine = 0;
  var decodeHelper = new PR_DecodeHelper();

  var chunksOut = [];
  for (var chunkIndex = 0; chunkIndex < chunks.length; ++chunkIndex) {
    var chunk = chunks[chunkIndex];
    if (chunk.style == null) {
      // styleless chunks (embedded tags) pass through untouched
      chunksOut.push(chunk);
      continue;
    }

    var s = chunk.token;
    var pos = 0;  // index of the first character not yet copied to out
    var out = [];

    // walk over each character looking for tabs and newlines.
    // On tabs, expand them.  On newlines, reset charInLine.
    // Otherwise increment charInLine
    for (var charIndex = 0, n = s.length; charIndex < n;
         charIndex = decodeHelper.next) {
      decodeHelper.decode(s, charIndex);
      var ch = decodeHelper.ch;

      switch (ch) {
        case '\t':
          out.push(s.substring(pos, charIndex));
          // calculate how much space we need in front of this part
          // nSpaces is the amount of padding -- the number of spaces needed to
          // move us to the next column, where columns occur at multiples of
          // tabWidth.
          var nSpaces = tabWidth - (charInLine % tabWidth);
          charInLine += nSpaces;
          for (; nSpaces >= 0; nSpaces -= SPACES.length) {
            out.push(SPACES.substring(0, nSpaces));
          }
          pos = decodeHelper.next;
          break;
        case '\n': case '\r':
          charInLine = 0;
          break;
        default:
          ++charInLine;
      }
    }
    out.push(s.substring(pos));
    chunksOut.push(new PR_Token(out.join(''), chunk.style));
  }
  return chunksOut;
}

/** split markup into chunks of html tags (style null) and
  * plain text (style {@link #PR_PLAIN}).
  *
  * @param {String} s html.
  * @return {Array} of PR_Tokens of style PR_PLAIN, and null.
  * @private
  */
function PR_chunkify(s) {
  // The below pattern matches one of the following
  // (1) /[^<]+/ : A run of characters other than '<'
  // (2) /<\/?[a-zA-Z][^>]*>/ : A probable tag that should not be highlighted
  // (3) /</ : A '<' that does not begin a larger chunk.  Treated as 1
  var chunkPattern = /(?:[^<]+|<\/?[a-zA-Z][^>]*>|<)/g;
  // since the pattern has the 'g' modifier and defines no capturing groups,
  // this will return a list of all chunks which we then classify and wrap as
  // PR_Tokens
  var matches = s.match(chunkPattern);
  var chunks = [];
  if (matches) {
    var lastChunk = null;
    for (var i = 0, n = matches.length; i < n; ++i) {
      var chunkText = matches[i];
      var style;
      if (chunkText.length < 2 || chunkText.charAt(0) !== '<') {
        // merge adjacent runs of plain text into a single chunk
        if (lastChunk && lastChunk.style === PR_PLAIN) {
          lastChunk.token += chunkText;
          continue;
        }
        style = PR_PLAIN;
      } else {  // a tag
        style = null;
      }
      lastChunk = new PR_Token(chunkText, style);
      chunks.push(lastChunk);
    }
  }
  return chunks;
}

/** walk the tokenEnds list and the chunk list in parallel to generate a list
  * of split tokens.
  * @param {Array} chunks PR_Tokens as produced by PR_chunkify
  * @param {Array} tokenEnds PR_TokenEnds in absolute space, i.e. positions in
  *   the concatenation of all chunk texts
  * @return {Array} of PR_Tokens.  Pieces of styleless chunks stay styleless
  *   when a whole remaining chunk is emitted.
  * @private
  */
function PR_splitChunks(chunks, tokenEnds) {
  var tokens = [];  // the output

  var ci = 0;  // index into chunks
  // position of beginning of amount written so far in absolute space.
  var posAbs = 0;
  // position of amount written so far in chunk space
  var posChunk = 0;

  // current chunk
  var chunk = new PR_Token('', null);
  for (var ei = 0, ne = tokenEnds.length, lastEnd = 0; ei < ne; ++ei) {
    var tokenEnd = tokenEnds[ei];
    var end = tokenEnd.end;
    if (end === lastEnd) { continue; }  // skip empty regions
    // remember this end so later duplicates count as empty regions too
    // (lastEnd used to stay 0 forever)
    lastEnd = end;

    var tokLen = end - posAbs;
    var remainingInChunk = chunk.token.length - posChunk;
    // consume whole chunks for as long as the token reaches past them
    while (remainingInChunk <= tokLen) {
      if (remainingInChunk > 0) {
        tokens.push(
            new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
                         null == chunk.style ? null : tokenEnd.style));
      }
      posAbs += remainingInChunk;
      posChunk = 0;
      if (ci < chunks.length) {
        chunk = chunks[ci++];
      }

      tokLen = end - posAbs;
      remainingInChunk = chunk.token.length - posChunk;
    }

    if (tokLen) {
      tokens.push(
          new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
                       tokenEnd.style));
      posAbs += tokLen;
      posChunk += tokLen;
    }
  }

  return tokens;
}

/** splits markup tokens into declarations, tags, and source chunks.
  * @param {Array} chunks PR_Tokens as produced by PR_chunkify
  * @return {Array} of PR_TokenEnds in absolute space
  * @private
  */
function PR_splitMarkup(chunks) {
  // A state machine to split out declarations, tags, etc.
  // This state machine deals with absolute space in the text, indexed by k,
  // and position in the current chunk, indexed by i and tokenStart to
  // generate a list of the ends of tokens.
  // Absolute space is calculated by considering the chunks as appended into
  // one big string, as they were before being split.

  // Known failure cases
  // Server side scripting sections such as <?...?> in attributes.
  // i.e. <span class="<? foo ?>">
  // Handling this would require a stack, and we don't use PHP.

  // The output: a list of pairs of PR_TokenEnd instances
  var tokenEnds = [];

  var state = 0;  // FSM state variable
  var k = 0;  // position in absolute space of the start of the current chunk
  var tokenStart = -1;  // the start of the current token

  // Try to find a closing tag for any open <style> or <script> tags
  // We can't do this at a later stage because then the following case
  // would fail:
  // <script>document.writeln('<!--');</script>

  // We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
  // can check whether to enter into a no scripting section when the tag ends.
  var tokenChars = new Array(12);
  var tokenCharsI = 0;
  // if non null, the tag prefix that we need to see to break out.
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN != chunk.style) {
      k += chunk.token.length;
      continue;
    }
    var s = chunk.token;

    for (var i = 0, n = s.length; i < n; /* i = next at bottom */) {
      decodeHelper.decode(s, i);
      var ch = decodeHelper.ch;
      var next = decodeHelper.next;

      var tokenStyle = null;
      switch (state) {
        case 0:  // plain text
          if ('<' == ch) { state = 1; }
          if (ch.match(/[\r\n]/)) { state = 13; }
          break;
        case 1:  // seen '<'
          tokenCharsI = 0;
          if ('/' == ch) {  // only consider close tags if we're in script/style
            state = 7;
          } else if (null == endScriptTag) {
            if ('!' == ch) {
              state = 2;
            } else if (PR_isWordChar(ch)) {
              state = 8;
            } else if ('?' == ch) {
              state = 9;
            } else if ('%' == ch) {
              state = 11;
            } else if ('<' != ch) {
              state = 0;
            }
          } else if ('<' != ch) {
            state = 0;
          }
          break;
        case 2:  // seen '<!'
          if ('-' == ch) {
            state = 4;
          } else if (PR_isWordChar(ch)) {
            state = 3;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 3:  // inside a declaration such as a DOCTYPE
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_DECLARATION;
          }
          break;
        case 4:  // inside a comment
          if ('-' == ch) { state = 5; }
          break;
        case 5:  // inside a comment, seen '-'
          if ('-' == ch) { state = 6; }
          break;
        case 6:  // inside a comment, seen '--'
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_COMMENT;
          } else if ('-' == ch) {
            state = 6;
          } else {
            state = 4;
          }
          break;
        case 7:  // seen '</'
          if (PR_isWordChar(ch)) {
            state = 8;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 8:  // inside a tag
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_TAG;
          }
          break;
        case 9:  // inside a <? ... ?> section
          if ('?' == ch) { state = 10; }
          break;
        case 10:  // inside a <? ... ?> section, seen '?'
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('?' != ch) {
            state = 9;
          }
          break;
        case 11:  // inside a <% ... %> section
          if ('%' == ch) { state = 12; }
          break;
        case 12:  // inside a <% ... %> section, seen '%'
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('%' != ch) {
            state = 11;
          }
          break;
        case 13:  // seen a line break
          tokenCharsI = 0;
          if (!ch.match(/[\r\n]/)) {
            state = 0;
            tokenStyle = PR_NL;
            // reprocess this character in state 0
            next = i;
          }
          break;
      }

      if (tokenCharsI < tokenChars.length) {
        tokenChars[tokenCharsI++] = ch.toLowerCase();
      }
      if (1 == state || 13 == state) { tokenStart = k + i; }
      i = next;
      if (tokenStyle != null) {
        if (endScriptTag) {
          if (PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag)) {
            endScriptTag = null;
          }
        } else {
          if (PR_prefixMatch(tokenChars, tokenCharsI, 'script')) {
            endScriptTag = '/script';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'style')) {
            endScriptTag = '/style';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'xmp')) {
            endScriptTag = '/xmp';
          }
        }
        // disallow the tag if endScriptTag is set and this was not an open
        // tag.
        if (endScriptTag && tokenCharsI && '/' == tokenChars[0]) {
          tokenStyle = null;
        }
        if (null != tokenStyle) {
          // everything before the token is plain text
          tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
          tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
        }
      }
    }
    k += chunk.token.length;
  }
  tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));

  return tokenEnds;
}
/** splits the given string into comment, string, and "other" tokens.
  * @param {Array} chunks PR_Tokens; only chunks of style PR_PLAIN are lexed
  * @return {Array} of PR_Tokens with style in
  *   (PR_STRING, PR_REGEX, PR_COMMENT, PR_PLAIN, null)
  *   The result array may contain spurious zero length tokens.  Ignore them.
  *
  * @private
  */
function PR_splitStringAndCommentTokens(chunks) {
  // a state machine to split out comments, strings, and other stuff
  //   0: plain text
  //   1: inside a string or regexp literal delimited by delim
  //   2: inside a string, right after a backslash
  //   3: seen '/' which may start a comment, a regexp, or be a division
  //   4: inside a line comment (// or #)
  //   5: inside a block comment
  //   6: inside a block comment, seen '*'
  var tokenEnds = [];  // positions of ends of tokens in absolute space
  var state = 0;  // FSM state variable
  var delim = null;  // string delimiter
  var k = 0;  // absolute position of beginning of current chunk
  var lookBehind = [];  // the last few characters processed collapsing space
  var lastCh = '';
  var decodeHelper = new PR_DecodeHelper();

  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    var s = chunk.token;
    if (PR_PLAIN == chunk.style) {
      // index of the previous character in s.  It starts at -1 so that
      // k + last still addresses the final character of the preceding text
      // when a '/' was the last character of an earlier chunk.
      var last = -1;
      var next;
      for (var i = 0, n = s.length; i < n; last = i, i = next) {
        decodeHelper.decode(s, i);
        var ch = decodeHelper.ch;
        next = decodeHelper.next;
        if (0 == state) {
          if (ch == '"' || ch == '\'' || ch == '`') {
            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
            state = 1;
            delim = ch;
          } else if (ch == '/') {
            state = 3;
          } else if (ch == '#') {
            state = 4;
            tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
          }
        } else if (1 == state) {
          if (ch == delim) {
            state = 0;
            tokenEnds.push(
                new PR_TokenEnd(k + next, '/' == ch ? PR_REGEX : PR_STRING));
          } else if (ch == '\\') {
            state = 2;
          }
        } else if (2 == state) {
          state = 1;
        } else if (3 == state) {
          if (ch == '/') {
            state = 4;
            tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
          } else if (ch == '*') {
            state = 5;
            tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
          } else {
            // check the last token and see if we should treat this as the start
            // of a regular expression literal.
            if ((!lookBehind.length ||
                 REGEXP_PRECEDER_PATTERN.test(lookBehind.join('')))) {
              // treat regular expression as a string with delimiter /
              state = 1;
              delim = '/';
              tokenEnds.push(new PR_TokenEnd(k + last, PR_PLAIN));
            } else {
              state = 0;
              // next loop will reenter state 0 with the same value of i, so
              // ch will be reconsidered as start of new token.
              next = i;
              continue;
            }
          }
        } else if (4 == state) {
          if (ch == '\r' || ch == '\n') {
            state = 0;
            tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
          }
        } else if (5 == state) {
          if (ch == '*') {
            state = 6;
          }
        } else if (6 == state) {
          if (ch == '/') {
            state = 0;
            tokenEnds.push(new PR_TokenEnd(k + next, PR_COMMENT));
            continue;  // skip lookbehind
          } else if (ch != '*') {
            state = 5;
          }
        }

        // push char on lookbehind if it's not a comment token.  Don't
        // waste space with lots of space ; just leave enough to indicate
        // boundaries.
        if (3 > state || state > 6) {
          var isSpace = PR_isSpaceChar(ch);
          if (!(lastCh === ' ' && isSpace)) {
            if (lookBehind.length > 16) { lookBehind.shift(); }
            lastCh = isSpace ? ' ' : ch;
            lookBehind.push(lastCh);
          }
        }
      }
    }
    k += s.length;
  }
  var endTokenType;
  switch (state) {
    case 1: case 2:
      // an unterminated literal keeps the style it would have had if closed
      endTokenType = '/' == delim ? PR_REGEX : PR_STRING;
      break;
    case 4: case 5: case 6:
      endTokenType = PR_COMMENT;
      break;
    default:
      endTokenType = PR_PLAIN;
      break;
  }
  // handle unclosed token which can legally happen for line comments (state 4)
  tokenEnds.push(new PR_TokenEnd(k, endTokenType));  // a token ends at the end

  return PR_splitChunks(chunks, tokenEnds);
}
856 * @private 857 */ 858function PR_splitNonStringNonCommentToken(s, outlist) { 859 var pos = 0; 860 var state = 0; 861 862 var decodeHelper = new PR_DecodeHelper(); 863 var next; 864 for (var i = 0; i <= s.length; i = next) { 865 if (i == s.length) { 866 // nstate will not be equal to state, so it will append the token 867 nstate = -2; 868 next = i + 1; 869 } else { 870 decodeHelper.decode(s, i); 871 next = decodeHelper.next; 872 var ch = decodeHelper.ch; 873 874 // the next state. 875 // if set to -1 then it will cause a reentry to state 0 without consuming 876 // another character. 877 var nstate = state; 878 879 switch (state) { 880 case 0: // whitespace state 881 if (PR_isIdentifierStart(ch)) { 882 nstate = 1; 883 } else if (PR_isDigitChar(ch)) { 884 nstate = 2; 885 } else if (ch.match(/[\r\n]/)) { 886 nstate = 3; 887 } else if (!PR_isSpaceChar(ch)) { 888 nstate = 3; 889 } 890 if (nstate && pos < i) { 891 var t = s.substring(pos, i); 892 outlist.push(new PR_Token(t, PR_PLAIN)); 893 pos = i; 894 } 895 break; 896 case 1: // identifier state 897 if (!PR_isIdentifierPart(ch)) { 898 nstate = -1; 899 } 900 break; 901 case 2: // number literal state 902 // handle numeric literals like 903 // 0x7f 300UL 100_000 904 905 // this does not treat floating point values as a single literal 906 // 0.1 and 3e-6 907 // are each split into multiple tokens 908 if (!(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_')) { 909 nstate = -1; 910 } 911 break; 912 case 3: // punctuation state 913 if (PR_isIdentifierStart(ch) || PR_isDigitChar(ch) || 914 PR_isSpaceChar(ch)) { 915 nstate = -1; 916 } 917 break; 918 } 919 } 920 921 if (nstate != state) { 922 if (nstate < 0) { 923 if (i > pos) { 924 var t = s.substring(pos, i); 925 var wordDecodeHelper = new PR_DecodeHelper(); 926 wordDecodeHelper.decode(t, 0); 927 var ch0 = wordDecodeHelper.ch; 928 var isSingleCharacter = wordDecodeHelper.next == t.length; 929 var style; 930 if (PR_isIdentifierStart(ch0)) { 931 if (PR_keywords[t]) { 932 
style = PR_KEYWORD; 933 } else if (ch0 === '@') { 934 style = PR_LITERAL; 935 } else { 936 // Treat any word that starts with an uppercase character and 937 // contains at least one lowercase character as a type, or 938 // ends with _t. 939 // This works perfectly for Java, pretty well for C++, and 940 // passably for Python. The _t catches C structs. 941 var isType = false; 942 if (ch0 >= 'A' && ch0 <= 'Z') { 943 for (var j = wordDecodeHelper.next; 944 j < t.length; j = wordDecodeHelper.next) { 945 wordDecodeHelper.decode(t, j); 946 var ch1 = wordDecodeHelper.ch; 947 if (ch1 >= 'a' && ch1 <= 'z') { 948 isType = true; 949 break; 950 } 951 } 952 if (!isType && !isSingleCharacter && 953 t.substring(t.length - 2) == '_t') { 954 isType = true; 955 } 956 } 957 style = isType ? PR_TYPE 958 : t==t.toUpperCase()?PR_CONSTANT 959 :PR_PLAIN; 960 } 961 } else if (PR_isDigitChar(ch0)) { 962 style = PR_LITERAL; 963 } else if (!PR_isSpaceChar(ch0)) { 964 style = PR_PUNCTUATION; 965 } else if (ch0.match(/[\r\n]/)) { 966 style = PR_NL; 967 } else { 968 style = PR_PLAIN; 969 } 970 pos = i; 971 outlist.push(new PR_Token(t, style)); 972 } 973 974 state = 0; 975 if (nstate == -1) { 976 // don't increment. This allows us to use state 0 to redispatch based 977 // on the current character. 978 next = i; 979 continue; 980 } 981 } 982 state = nstate; 983 } 984 } 985 986} 987 988/** split a group of chunks of markup. 989 * @private 990 */ 991function PR_tokenizeMarkup(chunks) { 992 if (!(chunks && chunks.length)) { return chunks; } 993 994 var tokenEnds = PR_splitMarkup(chunks); 995 return PR_splitChunks(chunks, tokenEnds); 996} 997 998/** split tags attributes and their values out from the tag name, and 999 * recursively lex source chunks. 1000 * @private 1001 */ 1002function PR_splitTagAttributes(tokens) { 1003 var tokensOut = []; 1004 var state = 0; 1005 var stateStyle = PR_TAG; 1006 var delim = null; // attribute delimiter for quoted value state. 
1007 var decodeHelper = new PR_DecodeHelper(); 1008 for (var ci = 0; ci < tokens.length; ++ci) { 1009 var tok = tokens[ci]; 1010 if (PR_TAG == tok.style) { 1011 var s = tok.token; 1012 var start = 0; 1013 for (var i = 0; i < s.length; /* i = next at bottom */) { 1014 decodeHelper.decode(s, i); 1015 var ch = decodeHelper.ch; 1016 var next = decodeHelper.next; 1017 1018 var emitEnd = null; // null or position of end of chunk to emit. 1019 var nextStyle = null; // null or next value of stateStyle 1020 if (ch == '>') { 1021 if (PR_TAG != stateStyle) { 1022 emitEnd = i; 1023 nextStyle = PR_TAG; 1024 } 1025 } else { 1026 switch (state) { 1027 case 0: 1028 if ('<' == ch) { state = 1; } 1029 break; 1030 case 1: 1031 if (PR_isSpaceChar(ch)) { state = 2; } 1032 break; 1033 case 2: 1034 if (!PR_isSpaceChar(ch)) { 1035 nextStyle = PR_ATTRIB_NAME; 1036 emitEnd = i; 1037 state = 3; 1038 } 1039 break; 1040 case 3: 1041 if ('=' == ch) { 1042 emitEnd = i; 1043 nextStyle = PR_TAG; 1044 state = 5; 1045 } else if (PR_isSpaceChar(ch)) { 1046 emitEnd = i; 1047 nextStyle = PR_TAG; 1048 state = 4; 1049 } 1050 break; 1051 case 4: 1052 if ('=' == ch) { 1053 state = 5; 1054 } else if (!PR_isSpaceChar(ch)) { 1055 emitEnd = i; 1056 nextStyle = PR_ATTRIB_NAME; 1057 state = 3; 1058 } 1059 break; 1060 case 5: 1061 if ('"' == ch || '\'' == ch) { 1062 emitEnd = i; 1063 nextStyle = PR_ATTRIB_VALUE; 1064 state = 6; 1065 delim = ch; 1066 } else if (!PR_isSpaceChar(ch)) { 1067 emitEnd = i; 1068 nextStyle = PR_ATTRIB_VALUE; 1069 state = 7; 1070 } 1071 break; 1072 case 6: 1073 if (ch == delim) { 1074 emitEnd = next; 1075 nextStyle = PR_TAG; 1076 state = 2; 1077 } 1078 break; 1079 case 7: 1080 if (PR_isSpaceChar(ch)) { 1081 emitEnd = i; 1082 nextStyle = PR_TAG; 1083 state = 2; 1084 } 1085 break; 1086 } 1087 } 1088 if (emitEnd) { 1089 if (emitEnd > start) { 1090 tokensOut.push( 1091 new PR_Token(s.substring(start, emitEnd), stateStyle)); 1092 start = emitEnd; 1093 } 1094 stateStyle = nextStyle; 1095 } 1096 
/**
 * Identifies regions of markup that are really source code and recursively
 * lexes them with PR_lexSource.
 *
 * A region is opened by either
 *   - a PR_SOURCE token that starts with "<%" or "<?" (the two opening
 *     characters are emitted as a PR_TAG token), or
 *   - a PR_TAG token that starts with "<" (but not "</") followed by a name
 *     beginning with "script", "style" or "xmp" (case-insensitive).
 * While a region is open, PR_SOURCE and PR_PLAIN tokens are collected as
 * source chunks.  The region is closed by a PR_SOURCE token ending in "%>" or
 * "?>" (for the first kind), by any PR_TAG token starting with "</", or by
 * the end of the input.  On close, the collected chunks are lexed and emitted
 * between two unstyled tokens, '<span class=embsrc>' and '</span>'.
 *
 * Unstyled tokens (style null) are never inspected: they are kept in place,
 * inside the collected source if a region has started collecting, otherwise
 * directly in the output.
 *
 * @param {Array} tokens list of PR_Token.  The array is not modified.
 * @return {Array} a new list of PR_Token.
 * @private
 */
function PR_splitSourceNodes(tokens) {
  var tokensOut = [];
  // While a source region is open this holds the character that introduces
  // its end marker: '/' for script/style/xmp tags, '%' or '?' for <% and <?.
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  // Chunks collected for the open region; null until the first one arrives.
  var sourceChunks = null;

  for (var ci = 0, nc = tokens.length; /* break below */; ++ci) {
    var tok;

    if (ci < nc) {
      tok = tokens[ci];
      if (null == tok.style) {
        // Unstyled tokens pass through untouched.  They must go to the
        // output side (never back onto the input list), and must stay
        // in order relative to any source text being collected.
        (sourceChunks || tokensOut).push(tok);
        continue;
      }
    } else if (!endScriptTag) {
      break;
    } else {
      // else pretend there's an end tag so we can gracefully handle
      // unclosed source blocks
      tok = new PR_Token('', null);
    }

    var s = tok.token;

    if (null == endScriptTag) {
      if (PR_SOURCE == tok.style) {
        // split off any starting <? or <%
        if ('<' == decodeHelper.decode(s, 0)) {
          decodeHelper.decode(s, decodeHelper.next);
          if ('%' == decodeHelper.ch || '?' == decodeHelper.ch) {
            endScriptTag = decodeHelper.ch;
            tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
                                        PR_TAG));
            s = s.substring(decodeHelper.next, s.length);
          }
        }
      } else if (PR_TAG == tok.style) {
        if ('<' == decodeHelper.decode(s, 0) &&
            '/' != s.charAt(decodeHelper.next)) {
          var tagContent = s.substring(decodeHelper.next).toLowerCase();
          // FIXME(msamuel): this does not mirror exactly the code in
          // PR_splitMarkup that defers splitting tags inside script and
          // style blocks.
          if (PR_startsWith(tagContent, 'script') ||
              PR_startsWith(tagContent, 'style') ||
              PR_startsWith(tagContent, 'xmp')) {
            endScriptTag = '/';
          }
        }
      }
    }

    if (null != endScriptTag) {
      var endTok = null;
      if (PR_SOURCE == tok.style) {
        if (endScriptTag == '%' || endScriptTag == '?') {
          // split off a trailing %> or ?> if the token ends with one.
          var pos = s.lastIndexOf(endScriptTag);
          if (pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
              s.length == decodeHelper.next) {
            endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
            s = s.substring(0, pos);
          }
        }
        if (null == sourceChunks) { sourceChunks = []; }
        sourceChunks.push(new PR_Token(s, PR_PLAIN));
      } else if (PR_PLAIN == tok.style) {
        if (null == sourceChunks) { sourceChunks = []; }
        sourceChunks.push(tok);
      } else if (PR_TAG == tok.style) {
        // if it starts with </ then it must be the end tag.
        if ('<' == decodeHelper.decode(tok.token, 0) &&
            tok.token.length > decodeHelper.next &&
            '/' == decodeHelper.decode(tok.token, decodeHelper.next)) {
          endTok = tok;
        } else {
          tokensOut.push(tok);
        }
      } else if (ci >= nc) {
        // force the token to close
        endTok = tok;
      } else {
        if (sourceChunks) {
          sourceChunks.push(tok);
        } else {
          // push remaining tag and attribute tokens from the opening tag
          tokensOut.push(tok);
        }
      }
      if (endTok) {
        if (sourceChunks) {
          var sourceTokens = PR_lexSource(sourceChunks);
          tokensOut.push(new PR_Token('<span class=embsrc>', null));
          for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
            tokensOut.push(sourceTokens[si]);
          }
          tokensOut.push(new PR_Token('</span>', null));
          sourceChunks = null;
        }
        // the synthetic end-of-input token is empty and is not emitted.
        if (endTok.token) { tokensOut.push(endTok); }
        endScriptTag = null;
      }
    } else {
      tokensOut.push(tok);
    }
  }
  return tokensOut;
}
/**
 * Splits the quotes from an attribute value.
 *   ['"foo"'] -> ['"', 'foo', '"']
 *
 * The opening quote is looked for at the start of the first PR_PLAIN token
 * and the matching closing quote at the very end of the last PR_PLAIN token.
 * Both quotes are emitted as PR_ATTRIB_VALUE tokens and the text between them
 * stays PR_PLAIN.  If the value does not start with a single or double quote
 * the input is returned unchanged; if no matching closing quote is found only
 * the opening quote is split off.
 *
 * @param {Array} tokens list of PR_Token.  The array is not modified.
 * @return {Array} the same array when nothing was split, otherwise a new
 *     list of PR_Token.
 * @private
 */
function PR_splitAttributeQuotes(tokens) {
  var firstPlain = null, lastPlain = null;
  var i;
  for (i = 0; i < tokens.length; ++i) {
    if (PR_PLAIN == tokens[i].style) {
      firstPlain = i;
      break;
    }
  }
  for (i = tokens.length; --i >= 0;) {
    if (PR_PLAIN == tokens[i].style) {
      lastPlain = i;
      break;
    }
  }
  if (null == firstPlain) { return tokens; }

  var decodeHelper = new PR_DecodeHelper();
  var fs = tokens[firstPlain].token;
  var fc = decodeHelper.decode(fs, 0);
  if ('"' != fc && '\'' != fc) {
    return tokens;
  }
  // index just past the opening quote
  var fpos = decodeHelper.next;

  // Look for the closing quote at the end of the last plain token.  It may
  // be written as an entity (so try the last '&') or as a raw character (so
  // try the last character).  Trying both matters when the value contains
  // some other entity followed by a raw closing quote.
  var ls = tokens[lastPlain].token;
  var lc = null;
  var lpos = ls.length;
  var candidates = [ls.lastIndexOf('&'), ls.length - 1];
  for (var c = 0; c < candidates.length && null === lc; ++c) {
    var pos = candidates[c];
    if (pos < 0) { continue; }
    // When both quotes live in the same token the closing quote must not
    // overlap the opening one (e.g. a value consisting of a lone quote).
    if (lastPlain == firstPlain && pos < fpos) { continue; }
    if (fc == decodeHelper.decode(ls, pos) && decodeHelper.next == ls.length) {
      lc = fc;
      lpos = pos;
    }
  }

  var tokensOut = [];
  for (i = 0; i < firstPlain; ++i) {
    tokensOut.push(tokens[i]);
  }
  tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
  if (lastPlain == firstPlain) {
    tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
  } else {
    tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
    for (i = firstPlain + 1; i < lastPlain; ++i) {
      tokensOut.push(tokens[i]);
    }
    // The remainder of the last plain token belongs in the output, before
    // the closing quote; it must not be appended to the input list.
    if (lc) {
      tokensOut.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
    } else {
      tokensOut.push(tokens[lastPlain]);
    }
  }
  if (lc) {
    // styled like the opening quote so both delimiters render the same way.
    tokensOut.push(
        new PR_Token(ls.substring(lpos, ls.length), PR_ATTRIB_VALUE));
  }
  for (i = lastPlain + 1; i < tokens.length; ++i) {
    tokensOut.push(tokens[i]);
  }
  return tokensOut;
}
/**
 * Identifies attribute values that really contain source code and
 * recursively lexes them.
 *
 * An attribute is treated as source when its name (the concatenation of the
 * PR_ATTRIB_NAME tokens seen since the last reset) starts with "on" or equals
 * "style", case-insensitively, and is followed by a PR_TAG token containing
 * "=".  The attribute value tokens that follow are collected, passed through
 * PR_splitAttributeQuotes and PR_lexSource, and emitted between two unstyled
 * tokens, '<span class=embsrc>' and '</span>'.  Collection ends at the next
 * PR_TAG token, or at the end of the input if the tag is never closed.
 *
 * @param {Array} tokens list of PR_Token.
 * @return {Array} a new list of PR_Token.
 * @private
 */
function PR_splitSourceAttributes(tokens) {
  var tokensOut = [];

  // Value tokens collected for the current source attribute, or null.
  var sourceChunks = null;
  // True between the "=" of a source attribute and the next tag token.
  var inSource = false;
  // Attribute name accumulated so far.
  var name = '';

  /** lexes any collected source chunks and appends them to the output. */
  function flushSource() {
    if (!sourceChunks) { return; }
    tokensOut.push(new PR_Token('<span class=embsrc>', null));
    var sourceTokens = PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
    for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
      tokensOut.push(sourceTokens[si]);
    }
    tokensOut.push(new PR_Token('</span>', null));
    sourceChunks = null;
  }

  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    var tok = tokens[ci];
    var outList = tokensOut;
    if (PR_TAG == tok.style) {
      if (inSource) {
        // the tag token after the value ends the source attribute.
        inSource = false;
        name = '';
        flushSource();
      } else if (name && tok.token.indexOf('=') >= 0) {
        var nameLower = name.toLowerCase();
        if (PR_startsWith(nameLower, 'on') || 'style' == nameLower) {
          inSource = true;
        }
      } else {
        name = '';
      }
    } else if (PR_ATTRIB_NAME == tok.style) {
      name += tok.token;
    } else if (PR_ATTRIB_VALUE == tok.style) {
      if (inSource) {
        if (null == sourceChunks) { sourceChunks = []; }
        outList = sourceChunks;
        // restyle as plain text so that it is lexed as source.
        tok = new PR_Token(tok.token, PR_PLAIN);
      }
    } else {
      // any other token that falls inside a value being collected stays
      // with that value.
      if (sourceChunks) {
        outList = sourceChunks;
      }
    }
    outList.push(tok);
  }
  // The input may end inside an attribute value (an unterminated tag); emit
  // what was collected instead of silently dropping it.
  flushSource();
  return tokensOut;
}
/**
 * Returns a list of PR_Token objects given chunks of source code.
 *
 * This code treats ", ', and ` as string delimiters, and \ as a string escape.
 * It does not recognize perl's qq() style strings.  It has no special handling
 * for double delimiter escapes as in basic, or the tripled delimiters used in
 * python, but should work on those regardless although in those cases a single
 * string literal may be broken up into multiple adjacent string literals.
 *
 * It recognizes C, C++, and shell style comments.
 *
 * @param chunks PR_Tokens with style in (null, PR_PLAIN)
 * @return {Array} list of PR_Token.
 */
function PR_lexSource(chunks) {
  // split into strings, comments, and other.
  // We do this because strings and comments are easily recognizable and can
  // contain stuff that looks like other tokens, so we want to mark those early
  // so we don't recurse into them.
  var tokens = PR_splitStringAndCommentTokens(chunks);

  // split non comment|string tokens on whitespace and word boundaries
  var tokensOut = [];
  for (var i = 0; i < tokens.length; ++i) {
    var tok = tokens[i];
    if (PR_PLAIN === tok.style) {
      // the helper appends its result tokens directly to tokensOut.
      PR_splitNonStringNonCommentToken(tok.token, tokensOut);
      continue;
    }
    tokensOut.push(tok);
  }

  return tokensOut;
}

/**
 * Returns a list of PR_Token objects given chunks of markup.
 *
 * This code assumes that < characters are html escaped, but " are not.
 * It will do a reasonable job with an escaped <, but will not recognize an
 * escaped " as starting a string.
 *
 * This code recognizes a number of constructs.
 * <!-- ... --> comment
 * <!\w ... >   declaration
 * <\w ... >    tag
 * </\w ... >   tag
 * <?...?>      embedded source
 * &[#\w]...;   entity
 *
 * It does not recognize %foo; entities.
 *
 * It will recurse into any <style>, <script>, and on* attributes using
 * PR_lexSource.
 *
 * @param chunks PR_Tokens with style in (null, PR_PLAIN)
 * @return {Array} list of PR_Token.
 */
function PR_lexMarkup(chunks) {
  // This function runs four passes, each taking the previous pass's output:
  // 1) PR_tokenizeMarkup: split the chunks into comments, declarations,
  //    tags, embedded source, and text.
  //    Output: List<PR_Token> with style in (PR_TAG, PR_PLAIN, PR_SOURCE, null)
  // 2) PR_splitTagAttributes: go over each tag token and split out attribute
  //    names and values.
  //    Output: List<PR_Token> where style in
  //    (PR_TAG, PR_PLAIN, PR_SOURCE, NAME, VALUE, null)
  // 3) PR_splitSourceNodes: find embedded source sections, e.g. the body of
  //    a <script> tag, and lex them as source.
  // 4) PR_splitSourceAttributes: find attribute values that contain source,
  //    e.g. on* handlers, and lex them as source.
  var tokensOut = PR_tokenizeMarkup(chunks);
  tokensOut = PR_splitTagAttributes(tokensOut);
  tokensOut = PR_splitSourceNodes(tokensOut);
  tokensOut = PR_splitSourceAttributes(tokensOut);
  return tokensOut;
}

/**
 * Classifies the string as either source or markup and lexes appropriately.
 *
 * The input is treated as markup if the first non-whitespace character of
 * its text is a < and the last non-whitespace character is a >.  Only
 * PR_PLAIN chunks count as text, and plain chunks made up entirely of
 * whitespace are skipped when looking for the first and last character.
 *
 * @param {String} html
 * @return {Array} list of PR_Token.
 */
function PR_lexOne(html) {
  var chunks = PR_chunkify(html);

  /**
   * returns the trimmed text of the first (or, if fromEnd, the last) plain
   * chunk that is not blank, or null if there is none.
   */
  function edgeText(fromEnd) {
    var n = chunks.length;
    for (var k = 0; k < n; ++k) {
      var chunk = chunks[fromEnd ? n - 1 - k : k];
      if (PR_PLAIN == chunk.style) {
        var text = PR_trim(chunk.token);
        if (text.length) { return text; }
      }
    }
    return null;
  }

  var head = edgeText(false);
  // if there is a first non-blank chunk then there is a last one too.
  var isMarkup = null !== head && PR_startsWith(head, '<') &&
      PR_endsWith(edgeText(true), '>');

  return isMarkup ? PR_lexMarkup(chunks) : PR_lexSource(chunks);
}