Skip to content

Instantly share code, notes, and snippets.

@premasagar
Forked from bga/_gzip.js
Created August 11, 2010 17:57

Revisions

  1. @bga bga revised this gist Aug 11, 2010. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion _gzip.js
    Original file line number Diff line number Diff line change
    @@ -75,7 +75,8 @@ window._gzip = (function()
    return t;
    };

    var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';
    //var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';
    var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));eval(s)';

    /**
    @fn pack and wrap to <unpackCode> code <s> using gzip like algo (dictonaty based)
  2. @bga bga created this gist Aug 11, 2010.
    264 changes: 264 additions & 0 deletions _gzip.js
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,264 @@
    window._gzip = (function()
    {
    var _matchCount = function(s, word)
    {
    var n = 0, wordLen = word.length, p = -wordLen;

    while((p = s.indexOf(word, p + wordLen)) > -1)
    ++n;

    return n;
    };

    var _matchCountInDict = function(dict, word)
    {
    var n = 0, i = dict.length;

    while(i--)
    {
    if(dict[i].word.indexOf(word) > -1)
    ++n;
    }

    return n;
    };


    var _escapeForCString = function(s)
    {
    return s.replace(/\\/g, '\\\\').
    replace(/\'/g, '\\\'').
    //replace(/\"/g, '\\\"').
    replace(/\r/g, '\\r').
    replace(/\0/g, '\\x00').
    replace(/\n/g, '\\n');
    };
    var reEscapeableRE = /[\[\]\(\)\{\}\?\!\:\|\+\.\*\'\"]/g;
    var _escapeForRegExp = function(s)
    {
    return s.replace(reEscapeableRE, '\\$&');
    };
    var _escapeForRegExpReplace = function(s)
    {
    return s.replace(/\$/g, '\\$$');
    };

    var _removeEscapebleChars = function(a)
    {
    delete a['^'];
    delete a['\"'];
    delete a['\''];
    delete a['\\'];
    delete a['$'];
    delete a['['];
    delete a[']'];
    delete a['('];
    delete a[')'];
    delete a['{'];
    delete a['}'];
    delete a['?'];
    delete a['!'];
    delete a[':'];
    delete a['|'];
    delete a['+'];
    delete a['`'];
    delete a['.'];
    delete a['*'];
    };

    var _dumpDict = function(dict)
    {
    var t = '';
    var d, i = -1; while((d = dict[++i]))
    t += d.word + '--';

    return t;
    };

    var unpackCode = 'for(var s=\'__CODE__\',d=\'__DICT__\'.split("`"),i=0;v=d[i++];)s=s.replace(RegExp(v[0],"g"),v.slice(1));console.log(s)';

    /**
    @fn pack and wrap to <unpackCode> code <s> using gzip like algo (dictonaty based)
    @param allowNonAcsiiChars {Boolean} {Default = true} true if you allow use 32-256 chars range else 32-127.
    @param minProfit {Number} {Default = 0} threshold in bytes for each replace pair
    @return packed and wrapped to <unpackCode> code
    */
    var _gzip = function(s, allowNonAcsiiChars, minProfit)
    {
    if(allowNonAcsiiChars == null)
    allowNonAcsiiChars = true;
    if(minProfit == null)
    minProfit = 1;

    var availableCharMap = {};

    var i = 31, e = (allowNonAcsiiChars) ? 256 : 127;

    // gen all available char map
    while(++i < e)
    availableCharMap[String.fromCharCode(i)] = 0;

    // and remove some chars which require escaping (in RegExp too)
    _removeEscapebleChars(availableCharMap);

    // reduce availableCharMap - remove chars which used in <s>
    i = -1, e = s.length; while(++i < e)
    {
    delete availableCharMap[s.charAt(i)];
    }

    // convert <allowNonAcsiiChars> to <availableChars>
    var availableChars = [], j = 0;

    for(var i in availableCharMap)
    {
    if(availableCharMap.hasOwnProperty(i))
    availableChars[j++] = i;
    }

    //console.log(availableChars.join('> <'));
    //console.log(availableChars);

    var sLen = s.length;
    var dict = [], dictLen = 0;

    // build <dict>

    i = -1, e = sLen - 2; while(++i < e)
    {
    var c = s.charAt(i);
    var j = i + 1;

    do
    {
    while(++j < e && s.charAt(j) != c)
    ;

    if(j < e && s.charAt(i + 1) == s.charAt(++j))
    {
    var k = i + 1;

    do
    {
    var word = s.slice(i, k + 1);

    //if(!reEscapeableRE.test(word))
    dict[dictLen++] = {word: word, matchCount: _matchCount(s, word)};
    }
    while(++k < sLen && ++j < sLen && s.charAt(k) == s.charAt(j));
    }
    }
    while(j < e);
    }

    // sort <dict> by word
    dict.sort(function(a, b){ return 2*(a.word < b.word) - 1; });

    //console.log(_dumpDict(dict));

    // remove duplicating words
    var d, i = -1, word, j = 0; while((d = dict[++i]))
    {
    if(d.word != word)
    {
    dict[j++] = d;
    word = d.word;
    }
    }

    dict.length = dictLen = j;

    // word will be replaced using RegExp - prepare escaped vestion
    var d, i = -1; while((d = dict[++i]))
    d.escapedWord = _escapeForRegExp(d.word);

    // calc profit of each word in <dict>
    var d, i = -1; while((d = dict[++i]))
    {
    var escapedWord = _escapeForRegExpReplace(d.word);

    d.profit = d.matchCount*(escapedWord.length - 1) - escapedWord.length - 1 - 1;
    }

    // remove words with profit < <minProfit>
    var d, i = -1, j = -1; while((d = dict[++i]))
    {
    if(d.profit > minProfit)
    dict[++j] = d;
    }

    dict.length = dictLen = j + 1;

    //console.log(_dumpDict(dict));

    // and sort <dict> from last to top by profit and word length
    dict.sort(function(a, b){ return (b.profit - a.profit) || (b.word.length - a.word.length); });

    //console.log(_dumpDict(dict));

    // assign replaceChar, replace words to replaceChars and remove unused words
    var t = s;
    var d, i = -1, j = 0; while((d = dict[++i]))
    {
    if(t.indexOf(d.word) > -1)
    {
    d.replaceChar = availableChars[j];
    t = t.replace(new RegExp(d.escapedWord, 'g'), d.replaceChar);
    dict[j] = d;
    j++;
    }
    }

    dict.length = dictLen = j;

    console.log('t = ', t);

    // compress <dict>
    // calc match one word in other words
    var d, i = -1; while((d = dict[++i]))
    d.matchCountInDict = _matchCountInDict(dict, d.word);

    // sort <dict> by matchCountInDict from max to min
    dict.sort(function(a, b){ return a.matchCountInDict - b.matchCountInDict; });

    // redice <dict>
    var d, i = -1; while((d = dict[++i]))
    {
    var word = d.word;

    var j = i, d2; while((d2 = dict[++i]))
    word = word.replace(new RegExp(d2.escapedWord, 'g'), d2.replaceChar);

    d.word = word;
    }

    //console.log(_dumpDict(dict));
    console.log(dict);
    //return;

    // final

    // serialize <dict>
    var dictString = '';
    var d, i = -1; while((d = dict[++i]))
    dictString += d.replaceChar + _escapeForRegExpReplace(d.word) + '`';

    dictString = dictString.slice(0, -1);

    console.log(dictString);

    // and create unpack code
    var code = unpackCode.
    replace('__DICT__', _escapeForCString(dictString)).
    replace('__CODE__', _escapeForCString(t))
    ;

    console.log('code.length = ', code.length);
    console.log('s.length = ', s.length);
    console.log('profit = ', (s.length/code.length*100).toFixed(2), '%');

    return code;
    };

    return _gzip;
    })();