175 lines
6.6 KiB
JavaScript
175 lines
6.6 KiB
JavaScript
/* Tokenizer for JavaScript code */

// Builds the JavaScript tokenizer. The result, tokenizeJavaScript(source,
// startState), hands a character stream and a token-reading state function
// to the generic `tokenizer` (defined outside this file) and returns
// whatever that returns.
//
// The stream object is only ever used through these methods: endOfLine(),
// next(), equals(ch), nextWhileMatches(regexp) and get().
var tokenizeJavaScript = (function() {
  // Advance the stream until the given character (not preceded by a
  // backslash) is encountered, or the end of the line is reached.
  // Returns true only when the line ran out right after an unescaped
  // backslash, and false in every other case. Passing null as `end`
  // simply consumes the rest of the line.
  function nextUntilUnescaped(source, end) {
    var escaped = false;
    while (!source.endOfLine()) {
      var next = source.next();
      if (next === end && !escaped)
        return false;
      escaped = !escaped && next === "\\";
    }
    return escaped;
  }

  // A map of JavaScript's keywords. The a/b/c keyword distinction is
  // very rough, but it gives the parser enough information to parse
  // correct code correctly (we don't care that much how we parse
  // incorrect code). The style information included in these objects
  // is used by the highlighter to pick the correct CSS style for a
  // token.
  var keywords = function(){
    function result(type, style){
      return {type: type, style: "js-" + style};
    }
    // keywords that take a parenthesised expression, and then a
    // statement (if)
    var keywordA = result("keyword a", "keyword");
    // keywords that take just a statement (else)
    var keywordB = result("keyword b", "keyword");
    // keywords that optionally take an expression, and form a
    // statement (return)
    var keywordC = result("keyword c", "keyword");
    var operator = result("operator", "keyword");
    var atom = result("atom", "atom");
    return {
      "if": keywordA, "while": keywordA, "with": keywordA,
      "else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
      "return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "delete": keywordC, "throw": keywordC,
      "in": operator, "typeof": operator, "instanceof": operator,
      "var": result("var", "keyword"), "function": result("function", "keyword"), "catch": result("catch", "keyword"),
      "for": result("for", "keyword"), "switch": result("switch", "keyword"),
      "case": result("case", "keyword"), "default": result("default", "keyword"),
      "true": atom, "false": atom, "null": atom, "undefined": atom, "NaN": atom, "Infinity": atom
    };
  }();

  // Some helper regexps. None of them carries the g or y flag, so
  // repeated test() calls are stateless.
  // "/" is deliberately absent from isOperatorChar: a slash is always
  // dispatched on its own so that "!/re/" does not swallow the start of
  // the regexp. "^" and "~" are included so that bitwise operators are
  // not mistaken for identifiers.
  var isOperatorChar = /[+\-*&%=<>!?|~^]/;
  var isHexDigit = /[0-9A-Fa-f]/;
  var isDigit = /[0-9]/;
  var isWordChar = /[\w$]/;
  // Punctuation after which a "/" starts a regexp rather than a division.
  var allowsRegexpAfter = /^[\[{}\(,;:]$/;
  // Flags that may trail a regexp literal (d, g, i, m, s, u, v, y).
  var isRegexpFlag = /[dgimsuvy]/;

  // Wrapper around jsToken that helps maintain parser state (whether
  // we are inside of a multi-line comment or string, and whether the
  // next token could be a regular expression). Returns a function
  // (source, setState) that reads one token and, when the state changed,
  // passes a fresh state function to setState.
  function jsTokenState(inside, regexp) {
    // Normalise to a real boolean so the comparison below is meaningful.
    var allowRegexp = !!regexp;
    return function(source, setState) {
      var newInside = inside;
      var type = jsToken(inside, allowRegexp, source, function(c) {newInside = c;});
      var newRegexp;
      if (type.type === "comment")
        // Comments are transparent: "x = /* c */ /re/" must still see a
        // regexp, so the flag is carried over unchanged.
        newRegexp = allowRegexp;
      else
        newRegexp = type.type === "operator" || type.type === "keyword c" ||
          allowsRegexpAfter.test(type.type);
      if (newRegexp !== allowRegexp || newInside !== inside)
        setState(jsTokenState(newInside, newRegexp));
      return type;
    };
  }

  // The token reader, intended to be used by the tokenizer from
  // tokenize.js (through jsTokenState). Advances the source stream
  // over a token, and returns an object containing the type and style
  // of that token.
  //
  // inside:    "/*" when continuing a multi-line comment, a quote
  //            character when continuing a string, otherwise falsy.
  // regexp:    whether a "/" at this position starts a regexp literal.
  // source:    the character stream to read from.
  // setInside: callback receiving the new value for `inside`.
  function jsToken(inside, regexp, source, setInside) {
    function readHexNumber(){
      source.next(); // skip the 'x'
      source.nextWhileMatches(isHexDigit);
      return {type: "number", style: "js-atom"};
    }

    // Read the rest of a decimal literal: digits, an optional fraction
    // and an optional exponent with an optional "+" or "-" sign.
    function readNumber() {
      source.nextWhileMatches(isDigit);
      if (source.equals(".")){
        source.next();
        source.nextWhileMatches(isDigit);
      }
      if (source.equals("e") || source.equals("E")){
        source.next();
        if (source.equals("-") || source.equals("+"))
          source.next();
        source.nextWhileMatches(isDigit);
      }
      return {type: "number", style: "js-atom"};
    }

    // Read a word, look it up in keywords. If not found, it is a
    // variable, otherwise it is a keyword of the type found.
    function readWord() {
      source.nextWhileMatches(isWordChar);
      var word = source.get();
      var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
      return known ? {type: known.type, style: known.style, content: word} :
        {type: "variable", style: "js-variable", content: word};
    }

    // Read a regexp literal whose opening slash was already consumed.
    // The literal ends at the first slash that is neither escaped nor
    // inside a [...] character class, or at the end of the line.
    function readRegexp() {
      var escaped = false, inClass = false;
      while (!source.endOfLine()) {
        var next = source.next();
        if (!escaped) {
          if (next === "/" && !inClass)
            break;
          if (next === "[")
            inClass = true;
          else if (next === "]")
            inClass = false;
        }
        escaped = !escaped && next === "\\";
      }
      source.nextWhileMatches(isRegexpFlag);
      return {type: "regexp", style: "js-string"};
    }

    // Multi-line comments are tricky. We want to return the newlines
    // embedded in them as regular newline tokens, and then continue
    // returning a comment token for every line of the comment. So
    // some state has to be saved (inside) to indicate whether we are
    // inside a /* */ sequence.
    // `start` is the character consumed just before this call; it only
    // matters when it is "*", because then a directly following "/"
    // closes the comment.
    function readMultilineComment(start){
      var newInside = "/*";
      var maybeEnd = (start === "*");
      while (true) {
        if (source.endOfLine())
          break;
        var next = source.next();
        if (next === "/" && maybeEnd){
          newInside = null;
          break;
        }
        maybeEnd = (next === "*");
      }
      setInside(newInside);
      return {type: "comment", style: "js-comment"};
    }

    function readOperator() {
      source.nextWhileMatches(isOperatorChar);
      return {type: "operator", style: "js-operator"};
    }

    // Read up to the closing quote. When the line ends on an unescaped
    // backslash the string continues on the next line, so the quote is
    // remembered through setInside.
    function readString(quote) {
      var endBackSlash = nextUntilUnescaped(source, quote);
      setInside(endBackSlash ? quote : null);
      return {type: "string", style: "js-string"};
    }

    // Fetch the next token. Dispatches on first character in the
    // stream, or first two characters when the first is a slash.
    if (inside === "\"" || inside === "'")
      return readString(inside);
    var ch = source.next();
    if (inside === "/*")
      return readMultilineComment(ch);
    else if (ch === "\"" || ch === "'")
      return readString(ch);
    // with punctuation, the type of the token is the symbol itself
    else if (/[\[\]{}\(\),;\:\.]/.test(ch))
      return {type: ch, style: "js-punctuation"};
    else if (ch === "0" && (source.equals("x") || source.equals("X")))
      return readHexNumber();
    else if (isDigit.test(ch))
      return readNumber();
    else if (ch === "/"){
      if (source.equals("*"))
      { source.next(); return readMultilineComment(ch); }
      else if (source.equals("/"))
      { nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};}
      else if (regexp)
        return readRegexp();
      else
        return readOperator();
    }
    else if (isOperatorChar.test(ch))
      return readOperator();
    else
      return readWord();
  }

  // The external interface to the tokenizer. When no start state is
  // given, tokenizing begins outside any string or comment, with a
  // regexp allowed as the first token.
  return function(source, startState) {
    return tokenizer(source, startState || jsTokenState(false, true));
  };
})();
|