I'm using the Javascript window.atob()
function to decode a base64-encoded string (specifically the base64-encoded content from the GitHub API). Problem is I'm getting ASCII-encoded characters back (like â¢
instead of ™
). How can I properly handle the incoming base64-encoded stream so that it's decoded as utf-8?
There's a great article on Mozilla's MDN docs that describes exactly this issue:
The "Unicode Problem" Since
DOMString
s are 16-bit-encoded strings, in most browsers callingwindow.btoa
on a Unicode string will cause aCharacter Out Of Range exception
if a character exceeds the range of a 8-bit byte (0x00~0xFF). There are two possible methods to solve this problem:
- the first one is to escape the whole string (with UTF-8, see
encodeURIComponent
) and then encode it;- the second one is to convert the UTF-16
DOMString
to an UTF-8 array of characters and then encode it.
A note on previous solutions: the MDN article originally suggested using unescape
and escape
to solve the Character Out Of Range
exception problem, but they have since been deprecated. Some other answers here have suggested working around this with decodeURIComponent
and encodeURIComponent
, this has proven to be unreliable and unpredictable. The most recent update to this answer uses modern JavaScript functions to improve speed and modernize code.
If you're trying to save yourself some time, you could also consider using a library:
function b64EncodeUnicode(str) {
// first we use encodeURIComponent to get percent-encoded UTF-8,
// then we convert the percent encodings into raw bytes which
// can be fed into btoa.
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
function toSolidBytes(match, p1) {
return String.fromCharCode('0x' + p1);
}));
}
b64EncodeUnicode('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"
b64DecodeUnicode('Cg=='); // "\n"
Here is the the current recommendation, direct from MDN, with some additional TypeScript compatibility via @MA-Maddin:
// Encoding UTF8 ⇢ base64
function b64EncodeUnicode(str) {
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g, function(match, p1) {
return String.fromCharCode(parseInt(p1, 16))
}))
}
b64EncodeUnicode('✓ à la mode') // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n') // "Cg=="
// Decoding base64 ⇢ UTF8
function b64DecodeUnicode(str) {
return decodeURIComponent(Array.prototype.map.call(atob(str), function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2)
}).join(''))
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU=') // "✓ à la mode"
b64DecodeUnicode('Cg==') // "\n"
This used escape
and unescape
(which are now deprecated, though this still works in all modern browsers):
function utf8_to_b64( str ) {
return window.btoa(unescape(encodeURIComponent( str )));
}
function b64_to_utf8( str ) {
return decodeURIComponent(escape(window.atob( str )));
}
// Usage:
utf8_to_b64('✓ à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64_to_utf8('4pyTIMOgIGxhIG1vZGU='); // "✓ à la mode"
And one last thing: I first encountered this problem when calling the GitHub API. To get this to work on (Mobile) Safari properly, I actually had to strip all white space from the base64 source before I could even decode the source. Whether or not this is still relevant in 2017, I don't know:
function b64_to_utf8( str ) {
str = str.replace(/\s/g, '');
return decodeURIComponent(escape(window.atob( str )));
}