Last active
August 1, 2022 03:26
-
-
Save mdciotti/121c3e16540b1b273b4bf96156b19baf to your computer and use it in GitHub Desktop.
Completely encode all characters of a string into a URL escape sequence.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// NOTE: This is overkill. I wrote this and then quickly realized there is a better way to do it. See below.
/**
 * Tests whether a UTF-16 code unit is a leading (high) surrogate.
 * https://tc39.es/ecma262/#leading-surrogate
 * @param {number} codeUnit - the code unit value to test
 * @returns {boolean} true when codeUnit lies in the inclusive range 0xD800-0xDBFF
 */
function isLeadingSurrogate(codeUnit) {
  return 0xD800 <= codeUnit && codeUnit <= 0xDBFF;
}
/**
 * Tests whether a UTF-16 code unit is a trailing (low) surrogate.
 * https://tc39.es/ecma262/#trailing-surrogate
 * @param {number} codeUnit - the code unit value to test
 * @returns {boolean} true when codeUnit lies in the inclusive range 0xDC00-0xDFFF
 */
function isTrailingSurrogate(codeUnit) {
  return 0xDC00 <= codeUnit && codeUnit <= 0xDFFF;
}
/**
 * Two code units, lead and trail, that form a UTF-16 surrogate pair are
 * converted to a code point.
 * https://tc39.es/ecma262/#sec-utf16decode
 * @param {number} lead - leading surrogate code unit (0xD800-0xDBFF)
 * @param {number} trail - trailing surrogate code unit (0xDC00-0xDFFF)
 * @returns {number} the supplementary-plane code point encoded by the pair
 * @throws {Error} 'unpaired surrogate' when lead/trail are not a valid pair
 */
function UTF16Decode(lead, trail) {
  if (!(isLeadingSurrogate(lead) && isTrailingSurrogate(trail))) {
    throw new Error('unpaired surrogate');
  }
  // Each surrogate carries 10 payload bits; the pair is offset by 0x10000.
  return (lead - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000;
}
/**
 * Encodes the code point that starts at `position` in a UTF-16 string as
 * UTF-8 octets. One code unit is consumed for BMP characters and two (a
 * surrogate pair) for supplementary-plane characters.
 * @param {string} string - the source string
 * @param {number} position - index of the first code unit of the code point
 * @returns {number[]} one to four UTF-8 octets
 * @throws {RangeError} when there is no code unit at `position`
 * @throws {Error} 'unpaired surrogate' when the code unit at `position` is a
 *   surrogate that is not part of a well-formed lead/trail pair
 */
function UTF8OctetsAt(string, position) {
  const size = string.length;
  const lead = string.charCodeAt(position);
  // charCodeAt yields NaN for an out-of-range index; report that directly
  // instead of letting it fall through to the surrogate checks below.
  if (Number.isNaN(lead)) {
    throw new RangeError(`invalid position ${position}`);
  }
  if (lead <= 0x007F) {
    // 0xxxxxxx
    return [lead];
  }
  if (lead <= 0x07FF) {
    // 110xxxxx 10xxxxxx
    const first = 0b11000000 | ((0b0000011111000000 & lead) >> 6);
    const second = 0b10000000 | (0b0000000000111111 & lead);
    return [first, second];
  }
  if (lead <= 0xD7FF || lead >= 0xE000) {
    // 1110xxxx 10xxxxxx 10xxxxxx (any BMP code unit that is not a surrogate)
    const first = 0b11100000 | ((lead & 0b1111000000000000) >> 12);
    const second = 0b10000000 | ((lead & 0b0000111111000000) >> 6);
    const third = 0b10000000 | (lead & 0b0000000000111111);
    return [first, second, third];
  }
  // From here on `lead` is a surrogate; it must be a leading one followed by
  // a trailing one.
  if (isTrailingSurrogate(lead) || position + 1 === size) {
    throw new Error('unpaired surrogate');
  }
  const trail = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(trail)) {
    throw new Error('unpaired surrogate');
  }
  // Each letter represents one bit from the lead or trail.
  // lead:  110110vv vvwwwwxx
  // trail: 110111yy yyzzzzzz
  const vvvv = (0b0000001111000000 & lead) >> 6;
  // uuuuu = vvvv + 1 applies the 0x10000 supplementary-plane offset.
  const uuuuu = vvvv + 1;
  const wwww = (0b0000000000111100 & lead) >> 2;
  const xx = 0b0000000000000011 & lead;
  const yyyy = (0b0000001111000000 & trail) >> 6;
  const zzzzzz = 0b0000000000111111 & trail;
  // return 11110uuu 10uuwwww 10xxyyyy 10zzzzzz
  const first = 0b11110000 | (uuuuu >> 2);
  const second = 0b10000000 | ((uuuuu & 0b11) << 4) | wwww;
  const third = 0b10000000 | (xx << 4) | yyyy;
  const fourth = 0b10000000 | zzzzzz;
  return [first, second, third, fourth];
}
/**
 * Interprets a String as a sequence of UTF-16 encoded code points, as
 * described in ECMA-262 6.1.4, and reads from it a single code point starting
 * with the code unit at index `position`.
 * https://tc39.es/ecma262/#sec-codepointat
 * @param {string} string - the source string
 * @param {number} position - index of the code unit to start reading at
 * @returns {{CodePoint: number, CodeUnitCount: number, IsUnpairedSurrogate: boolean}}
 *   the code point read, how many code units it occupies (1 or 2), and
 *   whether it is a surrogate without a matching partner
 * @throws {RangeError} when `position` is not within [0, string.length)
 */
function CodePointAt(string, position) {
  const size = string.length;
  if (!(position >= 0 && position < size)) {
    throw new RangeError(`invalid position ${position}`);
  }
  // Read the code unit AT `position` (not at index 0).
  const first = string.charCodeAt(position);
  if (!isLeadingSurrogate(first) && !isTrailingSurrogate(first)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: false };
  }
  // A trailing surrogate with no lead before it, or a lead at end of string.
  if (isTrailingSurrogate(first) || position + 1 === size) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }
  const second = string.charCodeAt(position + 1);
  if (!isTrailingSurrogate(second)) {
    return { CodePoint: first, CodeUnitCount: 1, IsUnpairedSurrogate: true };
  }
  const cp = UTF16Decode(first, second);
  return { CodePoint: cp, CodeUnitCount: 2, IsUnpairedSurrogate: false };
}
/**
 * Encodes an entire string into URI escape codes: every code point is
 * converted to its UTF-8 octets and each octet is emitted as `%XX`
 * (uppercase hex, zero-padded to two digits). Unlike encodeURIComponent, no
 * character is left unescaped.
 * https://tc39.es/ecma262/#sec-encode
 * @param {string} value the string to escape
 * @returns {string} the fully percent-encoded string
 * @throws {URIError} 'unpaired surrogate' when value contains a surrogate
 *   code unit that is not part of a valid pair
 */
function encodeURIComplete(value) {
  const strLen = value.length;
  let R = '';
  let k = 0;
  while (k < strLen) {
    const cp = CodePointAt(value, k);
    if (cp.IsUnpairedSurrogate) throw new URIError('unpaired surrogate');
    for (const octet of UTF8OctetsAt(value, k)) {
      R += `%${octet.toString(16).toUpperCase().padStart(2, '0')}`;
    }
    // Skip both halves of a surrogate pair in one step.
    k += cp.CodeUnitCount;
  }
  return R;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Encodes an entire string into URI escape codes.
 * Relies on the built-in encodeURIComponent to encode characters not present
 * in its unescaped set, and manually encodes the small set of characters
 * that encodeURIComponent leaves untouched.
 * @param {string} value the string to escape
 * @returns {string} the fully percent-encoded string
 * @throws {URIError} propagated from encodeURIComponent when value contains
 *   a lone surrogate
 */
function encodeURIComplete(value) {
  // Array.from iterates by code point, so surrogate pairs stay together.
  return Array.from(value, (s) => {
    const defaultEncoded = encodeURIComponent(s);
    if (defaultEncoded !== s) return defaultEncoded;
    // Left as-is by encodeURIComponent: escape its single code unit by hand.
    const hex = s.charCodeAt(0).toString(16).toUpperCase().padStart(2, '0');
    return `%${hex}`;
  }).join('');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment