// scanner.js
  'use strict';

  // Token type identifiers shared with the rest of the project.
  var TokenType = require('./const.js').TokenType;
  // Character codes (UTF-16 code units) the scanner compares against.
  var TAB = 9;                  // '\t'
  var N = 10;                   // '\n'
  var F = 12;                   // '\f'
  var R = 13;                   // '\r'
  var SPACE = 32;               // ' '
  var DOUBLE_QUOTE = 34;        // '"'
  var QUOTE = 39;               // '\''
  var RIGHT_PARENTHESIS = 41;   // ')'
  var STAR = 42;                // '*'
  var SLASH = 47;               // '/'
  var BACK_SLASH = 92;          // '\\'
  var UNDERSCORE = 95;          // '_'
  var LEFT_CURLY_BRACE = 123;   // '{'
  var RIGHT_CURLY_BRACE = 125;  // '}'

  // Category ids stored in the per-character lookup tables.
  // 0 is left for "no category" (any character not listed explicitly).
  var WHITESPACE = 1;
  var PUNCTUATOR = 2;
  var DIGIT = 3;
  var STRING = 4;
  // Maps the character code of every single-character token to its token type.
  // Keys are character codes; the trailing comment shows the character itself.
  var PUNCTUATION = {
      9: TokenType.Tab,                  // '\t'
      10: TokenType.Newline,             // '\n'
      13: TokenType.Newline,             // '\r'
      32: TokenType.Space,               // ' '
      33: TokenType.ExclamationMark,     // '!'
      34: TokenType.QuotationMark,       // '"'
      35: TokenType.NumberSign,          // '#'
      36: TokenType.DollarSign,          // '$'
      37: TokenType.PercentSign,         // '%'
      38: TokenType.Ampersand,           // '&'
      39: TokenType.Apostrophe,          // '\''
      40: TokenType.LeftParenthesis,     // '('
      41: TokenType.RightParenthesis,    // ')'
      42: TokenType.Asterisk,            // '*'
      43: TokenType.PlusSign,            // '+'
      44: TokenType.Comma,               // ','
      45: TokenType.HyphenMinus,         // '-'
      46: TokenType.FullStop,            // '.'
      47: TokenType.Solidus,             // '/'
      58: TokenType.Colon,               // ':'
      59: TokenType.Semicolon,           // ';'
      60: TokenType.LessThanSign,        // '<'
      61: TokenType.EqualsSign,          // '='
      62: TokenType.GreaterThanSign,     // '>'
      63: TokenType.QuestionMark,        // '?'
      64: TokenType.CommercialAt,        // '@'
      91: TokenType.LeftSquareBracket,   // '['
      93: TokenType.RightSquareBracket,  // ']'
      94: TokenType.CircumflexAccent,    // '^'
      95: TokenType.LowLine,             // '_'
      123: TokenType.LeftCurlyBracket,   // '{'
      124: TokenType.VerticalLine,       // '|'
      125: TokenType.RightCurlyBracket,  // '}'
      126: TokenType.Tilde               // '~'
  };
  // Character codes that have an entry in PUNCTUATION, converted to numbers once.
  var PUNCTUATION_CODES = Object.keys(PUNCTUATION).map(Number);

  // The lookup tables cover codes 0 .. (largest punctuation code); callers must
  // range-check a code against SYMBOL_CATEGORY_LENGTH before indexing.
  var SYMBOL_CATEGORY_LENGTH = Math.max.apply(null, PUNCTUATION_CODES) + 1;
  // SYMBOL_CATEGORY[code] selects the branch getToken() takes for a character.
  var SYMBOL_CATEGORY = new Uint32Array(SYMBOL_CATEGORY_LENGTH);
  // IS_PUNCTUATOR[code] === PUNCTUATOR marks characters that end an identifier.
  var IS_PUNCTUATOR = new Uint32Array(SYMBOL_CATEGORY_LENGTH);

  // fill categories
  PUNCTUATION_CODES.forEach(function(code) {
      SYMBOL_CATEGORY[code] = PUNCTUATOR;
      IS_PUNCTUATOR[code] = PUNCTUATOR;
  });

  // don't treat as punctuator: an underscore may appear inside an identifier
  IS_PUNCTUATOR[UNDERSCORE] = 0;

  // '0' .. '9'
  for (var digitCode = 48; digitCode <= 57; digitCode++) {
      SYMBOL_CATEGORY[digitCode] = DIGIT;
  }

  // The assignments below intentionally override the PUNCTUATOR category set
  // above for whitespace and quote characters (IS_PUNCTUATOR is left untouched).
  SYMBOL_CATEGORY[SPACE] = WHITESPACE;
  SYMBOL_CATEGORY[TAB] = WHITESPACE;
  SYMBOL_CATEGORY[N] = WHITESPACE;
  SYMBOL_CATEGORY[R] = WHITESPACE;
  SYMBOL_CATEGORY[F] = WHITESPACE;

  SYMBOL_CATEGORY[QUOTE] = STRING;
  SYMBOL_CATEGORY[DOUBLE_QUOTE] = STRING;
  //
  // scanner
  //

  /**
   * Splits a source string into tokens, one at a time, with optional lookahead.
   *
   * @param {string} source Text to tokenize; a leading byte order mark is skipped.
   * @param {boolean} [initBlockMode] Truthy to start as if already inside a `{}` block.
   * @param {number} [initLine=1] Line number reported for the first line.
   * @param {number} [initColumn=1] Column reported for the character at offset 0.
   */
  var Scanner = function(source, initBlockMode, initLine, initColumn) {
      this.source = source;
      this.pos = source.charCodeAt(0) === 0xFEFF ? 1 : 0;
      this.eof = this.pos === this.source.length;

      this.line = typeof initLine === 'undefined' ? 1 : initLine;
      // column of a token is computed as `pos - lineStartPos`
      this.lineStartPos = typeof initColumn === 'undefined' ? -1 : -initColumn;

      // blockMode counts unclosed '{' and never drops below minBlockMode
      this.minBlockMode = initBlockMode ? 1 : 0;
      this.blockMode = this.minBlockMode;
      // set after an `url` identifier, cleared by ')'; disables `//` handling
      this.urlMode = false;

      this.prevToken = null;
      this.token = null;
      // tokens already read by lookup() but not yet returned by next()
      this.buffer = [];
  };

  Scanner.prototype = {
      /**
       * Peeks at a token without consuming it.
       *
       * @param {number} offset 0 for the current token, 1 for the next one, and so on.
       * @returns {?Object} The token, or null when the source ends before it.
       */
      lookup: function(offset) {
          if (offset === 0) {
              return this.token;
          }

          for (var i = this.buffer.length; !this.eof && i < offset; i++) {
              this.buffer.push(this.getToken());
          }

          return offset <= this.buffer.length ? this.buffer[offset - 1] : null;
      },

      /**
       * Tells whether the token at `offset` (see lookup) exists and has type `type`.
       *
       * @param {number} offset
       * @param {*} type A TokenType value.
       * @returns {boolean}
       */
      lookupType: function(offset, type) {
          var token = this.lookup(offset);

          return token !== null && token.type === type;
      },

      /**
       * Advances to the next token, updating `token` and `prevToken`.
       *
       * @returns {?Object} The new current token, or null at the end of the source.
       */
      next: function() {
          var newToken = null;

          if (this.buffer.length !== 0) {
              newToken = this.buffer.shift();
          } else if (!this.eof) {
              newToken = this.getToken();
          }

          this.prevToken = this.token;
          this.token = newToken;

          return newToken;
      },

      /**
       * Reads every remaining token starting at the current position.
       *
       * @returns {Object[]} Tokens in source order.
       */
      tokenize: function() {
          var tokens = [];

          // getToken() already leaves `pos` on the first character after the
          // token, so the loop must not advance the position itself.
          while (this.pos < this.source.length) {
              tokens.push(this.getToken());
          }

          return tokens;
      },

      /**
       * Reads one token starting at `pos` and moves `pos` just past it.
       *
       * @returns {{type: *, value: string, offset: number, line: number, column: number}}
       *     `offset`, `line` and `column` describe where the token starts.
       */
      getToken: function() {
          var code = this.source.charCodeAt(this.pos);
          var line = this.line;
          var column = this.pos - this.lineStartPos;
          var offset = this.pos;
          var next;
          var skip;
          var type;
          var value;

          switch (code < SYMBOL_CATEGORY_LENGTH ? SYMBOL_CATEGORY[code] : 0) {
              case DIGIT:
                  type = TokenType.DecimalNumber;
                  value = this.readDecimalNumber();
                  break;

              case STRING:
                  type = TokenType.String;
                  value = this.readString(code);
                  break;

              case WHITESPACE:
                  type = TokenType.Space;
                  value = this.readSpaces();
                  break;

              case PUNCTUATOR:
                  if (code === SLASH) {
                      next = this.pos + 1 < this.source.length ? this.source.charCodeAt(this.pos + 1) : 0;

                      if (next === STAR) { // /*
                          type = TokenType.Comment;
                          value = this.readComment();
                          break;
                      } else if (next === SLASH && !this.urlMode) { // //
                          if (this.blockMode > 0) {
                              // inside a block the whole run of slashes becomes
                              // the beginning of an identifier
                              skip = 2;
                              while (this.source.charCodeAt(this.pos + skip) === SLASH) {
                                  skip++;
                              }

                              type = TokenType.Identifier;
                              value = this.readIdentifier(skip);
                              this.urlMode = this.urlMode || value === 'url';
                          } else {
                              type = TokenType.Unknown;
                              value = this.readUnknown();
                          }
                          break;
                      }
                  }

                  type = PUNCTUATION[code];
                  value = String.fromCharCode(code);
                  this.pos++;

                  if (code === RIGHT_PARENTHESIS) {
                      this.urlMode = false;
                  } else if (code === LEFT_CURLY_BRACE) {
                      this.blockMode++;
                  } else if (code === RIGHT_CURLY_BRACE) {
                      if (this.blockMode > this.minBlockMode) {
                          this.blockMode--;
                      }
                  }
                  break;

              default:
                  type = TokenType.Identifier;
                  value = this.readIdentifier(0);
                  this.urlMode = this.urlMode || value === 'url';
          }

          // `>=` rather than `===`: never report more input once the end is reached
          this.eof = this.pos >= this.source.length;

          return {
              type: type,
              value: value,
              offset: offset,
              line: line,
              column: column
          };
      },

      /**
       * Tests whether `code` is a line break and, if so, records the new line.
       *
       * Must be called while `pos` is the index of the character `code` was read
       * from: on a line break it increments `line`, sets `lineStartPos` to `pos`,
       * and for "\r\n" first moves `pos` onto the "\n".
       *
       * @param {number} code Character code at `pos`.
       * @returns {boolean} True when `code` is "\n", "\f" or "\r".
       */
      isNewline: function(code) {
          if (code === N || code === F || code === R) {
              if (code === R && this.pos + 1 < this.source.length && this.source.charCodeAt(this.pos + 1) === N) {
                  this.pos++;
              }

              this.line++;
              this.lineStartPos = this.pos;
              return true;
          }

          return false;
      },

      /**
       * Reads a run of spaces, tabs and line breaks.
       *
       * @returns {string} The whitespace consumed.
       */
      readSpaces: function() {
          var start = this.pos;

          for (; this.pos < this.source.length; this.pos++) {
              var code = this.source.charCodeAt(this.pos);

              if (!this.isNewline(code) && code !== SPACE && code !== TAB) {
                  break;
              }
          }

          return this.source.substring(start, this.pos);
      },

      /**
       * Reads a block comment; `pos` must be on its opening slash.
       *
       * @returns {string} The comment including delimiters; an unterminated
       *     comment runs to the end of the source.
       */
      readComment: function() {
          var start = this.pos;

          for (this.pos += 2; this.pos < this.source.length; this.pos++) {
              var code = this.source.charCodeAt(this.pos);

              if (code === STAR) { // */
                  if (this.source.charCodeAt(this.pos + 1) === SLASH) {
                      this.pos += 2;
                      break;
                  }
              } else {
                  // keep line tracking up to date inside the comment
                  this.isNewline(code);
              }
          }

          return this.source.substring(start, this.pos);
      },

      /**
       * Reads a `//` sequence up to, but not including, the end of the line.
       *
       * @returns {string} The text consumed.
       */
      readUnknown: function() {
          var start = this.pos;

          for (this.pos += 2; this.pos < this.source.length; this.pos++) {
              var code = this.source.charCodeAt(this.pos);

              // The line break is left for the following whitespace token, which
              // records the new line; calling isNewline() here as well would
              // count the same line break twice.
              if (code === N || code === F || code === R) {
                  break;
              }
          }

          return this.source.substring(start, this.pos);
      },

      /**
       * Reads a quoted string; `pos` must be on the opening quote.
       *
       * @param {number} quote Character code of the opening quote.
       * @returns {string} The string including its quotes, with every
       *     backslash + line break pair removed. An unterminated string runs to
       *     the end of the source.
       */
      readString: function(quote) {
          var start = this.pos;
          var res = '';

          for (this.pos++; this.pos < this.source.length; this.pos++) {
              var code = this.source.charCodeAt(this.pos);

              if (code === BACK_SLASH) {
                  var end = this.pos++;

                  if (this.isNewline(this.source.charCodeAt(this.pos))) {
                      // drop the escaped line break from the value
                      res += this.source.substring(start, end);
                      start = this.pos + 1;
                  }
              } else if (code === quote) {
                  this.pos++;
                  break;
              }
          }

          // a backslash as the very last character steps one past the end
          if (this.pos > this.source.length) {
              this.pos = this.source.length;
          }

          return res + this.source.substring(start, this.pos);
      },

      /**
       * Reads a run of decimal digits; `pos` must be on the first digit.
       *
       * @returns {string} The digits consumed.
       */
      readDecimalNumber: function() {
          var start = this.pos;
          var code;

          for (this.pos++; this.pos < this.source.length; this.pos++) {
              code = this.source.charCodeAt(this.pos);

              if (code < 48 || code > 57) { // 0 .. 9
                  break;
              }
          }

          return this.source.substring(start, this.pos);
      },

      /**
       * Reads an identifier: everything up to the next punctuator character,
       * treating a backslash escape as part of the identifier.
       *
       * @param {number} skip Number of leading characters to accept unchecked.
       * @returns {string} The identifier text, escapes left as written.
       */
      readIdentifier: function(skip) {
          var start = this.pos;

          for (this.pos += skip; this.pos < this.source.length; this.pos++) {
              var code = this.source.charCodeAt(this.pos);

              if (code === BACK_SLASH) {
                  this.pos++;

                  // skip escaped unicode sequence that can end with whitespace
                  // [0-9a-f]{1,6}(\r\n|[ \n\r\t\f])?
                  for (var i = 0; i < 7 && this.pos + i < this.source.length; i++) {
                      code = this.source.charCodeAt(this.pos + i);

                      if (i !== 6) {
                          if ((code >= 48 && code <= 57) || // 0 .. 9
                              (code >= 65 && code <= 70) || // A .. F
                              (code >= 97 && code <= 102)) { // a .. f
                              continue;
                          }
                      }

                      if (i > 0) {
                          // Step onto the character after the hex digits first:
                          // isNewline() reads the position from `pos`.
                          this.pos += i;

                          if (code !== SPACE && code !== TAB && !this.isNewline(code)) {
                              // not a terminator, go back to the last hex digit
                              this.pos--;
                          }
                      }

                      break;
                  }
              } else if (code < SYMBOL_CATEGORY_LENGTH &&
                         IS_PUNCTUATOR[code] === PUNCTUATOR) {
                  break;
              }
          }

          // a backslash as the very last character steps one past the end
          if (this.pos > this.source.length) {
              this.pos = this.source.length;
          }

          return this.source.substring(start, this.pos);
      }
  };
  // warm up tokenizer to eliminate code branches that never execute
  // fix soft deoptimizations (insufficient type feedback)
  new Scanner('\n\r\r\n\f//""\'\'/**/1a;.{url(a)}').lookup(1e3);

  module.exports = Scanner;