#include "assert.h" #include "array.h" #include #include namespace stingray_plugin_foundation { inline char *encoding::utf8_encode(int c, char *utf8) { if (c<0x80) { *utf8 = c; return utf8 + 1; } else if (c<0x800) { utf8[0] = (c>>6)|0xC0; utf8[1] = (c&0x3F)|0x80; return utf8 + 2; } else if (c<0x10000) { utf8[0] = (c>>12)|0xE0; utf8[1] = ((c>>6)&0x3F)|0x80; utf8[2] = (c&0x3F)|0x80; return utf8 + 3; } else if (c<0x110000) { utf8[0] = (c>>18)|0xf0; utf8[1] = ((c>>12)&0x3F)|0x80; utf8[2] = ((c>>6)&0x3F)|0x80; utf8[3] = (c&0x3F)|0x80; return utf8 + 4; } else { XERROR("Cannot encode character"); } return utf8; } inline const char *encoding::utf8_decode(const char *utf8, int &codepoint) { char c = *utf8; if ((c&0x80)==0x0) { codepoint = c; return utf8 + 1; } else if ((c&0xE0)==0xC0) { unsigned char d = utf8[1]; XASSERT((d&0xC0)==0x80, "Archive not utf-8 encoded %s", utf8); codepoint = (static_cast(c&0x1F)<<6) | static_cast(d&0x3F); return utf8 + 2; } else if ((c&0xF0)==0xE0) { const char *d = utf8 + 1; XASSERT(((d[0]&0xC0)==0x80) && ((d[1]&0xC0)==0x80), "Archive not utf-8 encoded %s", utf8); codepoint = (static_cast(c&0x0f)<<12) | (static_cast(d[0]&0x3f)<<6) | static_cast(d[1]&0x3f); return utf8 + 3; } else if ((c&0xf8)==0xf0) { const char *d = utf8 + 1; XASSERT(((d[0]&0xc0)==0x80) && ((d[1]&0xc0)==0x80) && ((d[2]&0xc0)==0x80), "Archive not utf-8 encoded %s", utf8); codepoint = (static_cast(c&0x07)<<18) | (static_cast(d[0]&0x3f)<<12) | (static_cast(d[1]&0x3f)<<6) | static_cast(d[2]&0x3f); return utf8 + 4; } else { XERROR("Archive not utf-8 encoded %s", utf8); return utf8; } } inline unsigned encoding::utf8_codepoint_bytes(const char *buffer) { char c = *buffer; if ((c&0x80)==0x0) return 1; else if ((c&0xE0)==0xC0) return 2; else if ((c&0xF0)==0xE0) return 3; else if ((c&0xf8)==0xf0) return 4; else { XERROR("Length on part of utf-8 character"); return 1; } } inline void encoding::utf8_decode(const char *utf8, Array &codepoints) { while (*utf8) { int c; utf8 = utf8_decode(utf8, c); codepoints.push_back((unsigned)c); } } inline void encoding::utf8_encode(const Array &codepoints, Array &utf8) { utf8_encode(codepoints.begin(), codepoints.size(), utf8); } inline void encoding::utf8_encode(const unsigned *codepoints, unsigned size, Array &utf8) { const unsigned *it(codepoints), *end(codepoints + size); for(; it != end; ++it) { utf8.reserve(utf8.size() + 4); const char *new_ptr = utf8_encode(*it, utf8.end()); utf8.resize((unsigned)(new_ptr - utf8.begin())); } } inline void encoding::utf8_location(const char *utf8, unsigned index, unsigned &begin, unsigned &end) { XASSERT(index < strlen(utf8), "Index out of string"); // walk backwards to find beginning of utf8 character begin = index; while(true) { if (begin == 0) break; if ((utf8[begin] & 0xc0) != 0x80) break; --begin; } end = begin + utf8_codepoint_bytes(utf8 + begin); } inline const char * encoding::utf8_valid_first(const char *utf8) { char c = *utf8; const char *d = utf8 + 1; if ((c&0x80)==0x0) return utf8 + 1; else if ((c&0xE0)==0xC0) return (d[0]&0xC0)==0x80 ? utf8+2 : nullptr; else if ((c&0xF0)==0xE0) return (d[0]&0xC0)==0x80 && (d[1]&0xC0)==0x80 ? utf8+3 : nullptr; else if ((c&0xf8)==0xf0) return (d[0]&0xc0)==0x80 && (d[1]&0xc0)==0x80 && (d[2]&0xc0)==0x80 ? utf8+4 : nullptr; else return nullptr; } inline bool encoding::utf8_valid_all(const char *utf8) { while (*utf8) { utf8 = utf8_valid_first(utf8); if (!utf8) return false; } return true; } inline unsigned encoding::wstr_to_utf8_bytes(const wchar_t *ucs2) { char buffer[5]; unsigned size = 0; while (*ucs2) { size += (unsigned)(utf8_encode(*ucs2, &buffer[0]) - &buffer[0]); ++ucs2; } ++size; return size; } inline void encoding::wstr_to_utf8(const wchar_t *ucs2, char *utf8, unsigned size) { char *b = utf8; while (*ucs2) { b = utf8_encode(*ucs2, b); ++ucs2; } XENSURE(unsigned(b - utf8) < size); *b = 0; } inline void encoding::wstr_to_utf8(const wchar_t *ucs2, Array &utf8) { unsigned n = wstr_to_utf8_bytes(ucs2); utf8.resize(n); wstr_to_utf8(ucs2, utf8.begin(), n); } inline unsigned encoding::utf8_to_wstr_tokens(const char *utf8) { unsigned tokens = 0; while (*utf8) { ++tokens; utf8 += utf8_codepoint_bytes(utf8); } ++tokens; return tokens; } inline void encoding::utf8_to_wstr(const char *utf8, wchar_t *ucs2, unsigned tokens) { wchar_t *b = ucs2; while (*utf8) { int c; utf8 = utf8_decode(utf8, c); *b = c; ++b; } XENSURE(unsigned(b - ucs2) < tokens); *b = 0; } inline void encoding::utf8_to_wstr(const char *utf8, Array &ucs2) { unsigned n = utf8_to_wstr_tokens(utf8); ucs2.resize(n); utf8_to_wstr(utf8, ucs2.begin(), n); } inline wchar_t *encoding::utf8_to_wstr(const char *utf8, Allocator &a) { unsigned n = utf8_to_wstr_tokens(utf8); wchar_t *res = (wchar_t *)a.allocate(sizeof(wchar_t)*n); utf8_to_wstr(utf8, res, n); return res; } } // namespace stingray_plugin_foundation