Between regex and if-else

2012-07-18

For the curious, I just changed how my parser checks whether the next token is a punctuator (that's pretty much any non-character, non-digit, non-whitespace, sign in our source code). I used to do it with a regex. I thought it was pretty nifty to do so and it seemed to be the better performing way at the time. It isn't anymore.

So the regex is returned by this function:

Code:
function puncrex(){
// note: punctuators should be parsed long to short. regex picks longest first, parser wants that too.
var punc = [
'>>>=',
'===','!==','>>>','<<=','>>=',
'<=','>=','==','!=','\\+\\+','--','<<','>>','\\&\\&','\\|\\|','\\+=','-=','\\*=','%=','\\&=','\\|=','\\^=','\\/=',
'\\{','\\}','\\(','\\)','\\[','\\]','\\.',';',',','<','>','\\+','-','\\*','%','\\|','\\&','\\|','\\^','!','~','\\?',':','=','\\/'
];

return new RegExp('^(?:'+punc.join('|')+')');
}


(Which translates to: /^(?:>>>=|===|!==|>>>|<<=|>>=|<=|>=|==|!=|\+\+|--|<<|>>|\&\&|\|\||\+=|-=|\*=|%=|\&=|\|=|\^=|\/=|\{|\}|\(|\)|\[|\]|\.|;|,|<|>|\+|-|\*|%|\||\&|\||\^|!|~|\?|:|=|\/)/)

The equivalent if-else structure is as follows:

Code:
function checkPunctuator(c){
var input = this.input;
var pos = this.pos;

// >>>=,
// === !== >>> <<= >>=
// <= >= == != ++ -- << >> && || += -= *= %= &= |= ^= /=
// { } ( ) [ ] . ; , < > + - * % | & ^ ! ~ ? : = /

if (c === 0x28) return this.puncToken(1, '(');
if (c === 0x29) return this.puncToken(1, ')');
if (c === 0x7b) return this.puncToken(1, '{');
if (c === 0x7d) return this.puncToken(1, '}');
if (c === 0x5b) return this.puncToken(1, '[');
if (c === 0x5d) return this.puncToken(1, ']');
if (c === 0x2e) return this.puncToken(1, '.');
if (c === 0x3b) return this.puncToken(1, ';');
if (c === 0x2c) return this.puncToken(1, ',');
if (c === 0x3f) return this.puncToken(1, '?');
if (c === 0x3a) return this.puncToken(1, ':');

var d = input.charCodeAt(pos+1);

if (c === 0x3d) {
if (d === 0x3d) {
if (input.charCodeAt(pos+2) === 0x3d) return this.puncToken(3, '===');
return this.puncToken(2, '==');
}
return this.puncToken(1, '=');
}
if (c === 0x3c) {
if (d === 0x3d) return this.puncToken(2, '<=');
if (d === 0x3c) {
if (input.charCodeAt(pos+2) === 0x3d) return this.puncToken(3, '<<=');
return this.puncToken(2, '<<');
}
return this.puncToken(1, '<');
}

if (c === 0x3e) {
if (d === 0x3d) return this.puncToken(2, '>=');
if (d === 0x3e) {
var e = input.charCodeAt(pos+2);
if (e === 0x3d) return this.puncToken(3, '>>=');
if (e === 0x3e) {
if (input.charCodeAt(pos+3) === 0x3d) return this.puncToken(4, '>>>=');
return this.puncToken(3, '>>>');
}
return this.puncToken(2, '>>');
}
return this.puncToken(1, '>');
}

if (c === 0x2b) {
if (d === 0x3d) return this.puncToken(2, '+=');
if (d === 0x2b) return this.puncToken(2, '++');
return this.puncToken(1, '+');
}
if (c === 0x2d) {
if (d === 0x3d) return this.puncToken(2, '-=');
if (d === 0x2d) return this.puncToken(2, '--');
return this.puncToken(1, '-');
}
if (c === 0x2a) {
if (d === 0x3d) return this.puncToken(2, '*=');
return this.puncToken(1, '*');
}
if (c === 0x25) {
if (d === 0x3d) return this.puncToken(2, '%=');
return this.puncToken(1, '%');
}
if (c === 0x7c) {
if (d === 0x3d) return this.puncToken(2, '+=');
if (d === 0x7c) return this.puncToken(2, '+=');
return this.puncToken(1, '|');
}
if (c === 0x26) {
if (d === 0x3d) return this.puncToken(2, '&=');
if (d === 0x26) return this.puncToken(2, '&&');
return this.puncToken(1, '&');
}
if (c === 0x5e) {
if (d === 0x3d) return this.puncToken(2, '^=');
return this.puncToken(1, '^');
}
if (c === 0x21) {
if (d === 0x3d) {
if (input.charCodeAt(pos+2) === 0x3d) return this.puncToken(3, '!==');
return this.puncToken(2, '!=');
}
return this.puncToken(1, '~');
}
if (c === 0x7e) {
if (d === 0x3d) return this.puncToken(2, '~=');
return this.puncToken(1, '~');
}
if (c === 0x2f) {
// cant really be a //, /* or regex because they should have been checked before calling this function
if (d === 0x3d) return this.puncToken(2, '/=');
return this.puncToken(1, '/');
}

return false;
}

I know that's big, but this change shaved between 200ms and 300ms from 960-ish ms when parsing 8mb of js. So I'm now down to about 650ms. That's huge... And I still have to prioritize the order of the ifs by frequency!

Please note that optimization is a tricky business. Regexes are fast but carry a relatively big overhead for their execution. When comparing just a few characters, the overhead is killing. That's probably what happened in the example above. That does not mean that regular expressions are bad for parsing...

Just wanted to share that :)