diff options
author | Guido Günther <agx@sigxcpu.org> | 2014-02-05 08:38:27 +0100 |
---|---|---|
committer | Guido Günther <agx@sigxcpu.org> | 2014-02-05 08:38:27 +0100 |
commit | 14d771b90f5a7d3887e5e900d1fb4737477ad305 (patch) | |
tree | f382e3359d20916ae60d28361e59635e373224f8 /src/mm-charsets.c | |
parent | a09050a7f63a262bf90dcb1c7a41f9cfd205db43 (diff) |
Imported Upstream version 0.5.2.0upstream/0.5.2.0
Diffstat (limited to 'src/mm-charsets.c')
-rw-r--r-- | src/mm-charsets.c | 320 |
1 files changed, 293 insertions, 27 deletions
diff --git a/src/mm-charsets.c b/src/mm-charsets.c index cbdf388..832a06f 100644 --- a/src/mm-charsets.c +++ b/src/mm-charsets.c @@ -18,6 +18,7 @@ #include <stdlib.h> #include <unistd.h> #include <string.h> +#include <ctype.h> #include "mm-charsets.h" #include "mm-utils.h" @@ -36,8 +37,8 @@ static CharsetEntry charset_map[] = { { "IRA", "ASCII", "ASCII", "ASCII//TRANSLIT", MM_MODEM_CHARSET_IRA }, { "GSM", NULL, NULL, NULL, MM_MODEM_CHARSET_GSM }, { "8859-1", NULL, "ISO8859-1", "ISO8859-1//TRANSLIT", MM_MODEM_CHARSET_8859_1 }, - { "PCCP437", NULL, NULL, NULL, MM_MODEM_CHARSET_PCCP437 }, - { "PCDN", NULL, NULL, NULL, MM_MODEM_CHARSET_PCDN }, + { "PCCP437", "CP437", "CP437", "CP437//TRANSLIT", MM_MODEM_CHARSET_PCCP437 }, + { "PCDN", "CP850", "CP850", "CP850//TRANSLIT", MM_MODEM_CHARSET_PCDN }, { "HEX", NULL, NULL, NULL, MM_MODEM_CHARSET_HEX }, { NULL, NULL, NULL, NULL, MM_MODEM_CHARSET_UNKNOWN } }; @@ -160,7 +161,8 @@ mm_modem_charset_hex_to_utf8 (const char *src, MMModemCharset charset) g_return_val_if_fail (iconv_from != NULL, FALSE); unconverted = utils_hexstr2bin (src, &unconverted_len); - g_return_val_if_fail (unconverted != NULL, NULL); + if (!unconverted) + return NULL; if (charset == MM_MODEM_CHARSET_UTF8 || charset == MM_MODEM_CHARSET_IRA) return unconverted; @@ -425,6 +427,180 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len) return g_byte_array_free (gsm, FALSE); } +static gboolean +gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + guint8 gsm; + + *out_clen = 1; + if (utf8_to_gsm_def_char (utf8, ulen, &gsm)) + return TRUE; + if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) { + *out_clen = 2; + return TRUE; + } + return FALSE; +} + +static gboolean +ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + *out_clen = 1; + return (ulen == 1); +} + +static gboolean +ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + *out_clen = 2; + return (c <= 0xFFFF); +} + +static gboolean +iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + *out_clen = 1; + return (c <= 0xFF); +} + +static gboolean +pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + static const gunichar t[] = { + 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, + 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, + 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, + 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, + 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, + 0x2562, 0x2556, 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, + 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, + 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, + 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, 0x2518, + 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, 0x00df, 0x0393, + 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, 0x03a6, 0x0398, 0x03a9, 0x03b4, + 0x221e, 0x03c6, 0x03b5, 0x2229, 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, + 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, + 0x25a0, 0x00a0 + }; + int i; + + *out_clen = 1; + + if (c <= 0x7F) + return TRUE; + for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { + if (c == t[i]) + return TRUE; + } + return FALSE; +} + +static gboolean +pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen) +{ + static const gunichar t[] = { + 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, + 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x00c9, 0x00e6, + 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, 0x00d6, 0x00dc, + 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192, 0x00e1, 0x00ed, 0x00f3, 0x00fa, + 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, + 0x00a1, 0x00ab, 0x00bb, 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, + 0x00c2, 0x00c0, 0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, + 0x2510, 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3, + 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4, 0x00f0, + 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce, 0x00cf, 0x2518, + 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580, 0x00d3, 0x00df, 0x00d4, + 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe, 0x00de, 0x00da, 0x00db, 0x00d9, + 0x00fd, 0x00dd, 0x00af, 0x00b4, 0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, + 0x00a7, 0x00f7, 0x00b8, 0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, + 0x25a0, 0x00a0 + }; + int i; + + *out_clen = 1; + + if (c <= 0x7F) + return TRUE; + for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) { + if (c == t[i]) + return TRUE; + } + return FALSE; +} + +typedef struct { + MMModemCharset cs; + gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen); + guint charsize; +} SubsetEntry; + +SubsetEntry subset_table[] = { + { MM_MODEM_CHARSET_GSM, gsm_is_subset }, + { MM_MODEM_CHARSET_IRA, ira_is_subset }, + { MM_MODEM_CHARSET_UCS2, ucs2_is_subset }, + { MM_MODEM_CHARSET_8859_1, iso88591_is_subset }, + { MM_MODEM_CHARSET_PCCP437, pccp437_is_subset }, + { MM_MODEM_CHARSET_PCDN, pcdn_is_subset }, + { MM_MODEM_CHARSET_UNKNOWN, NULL }, +}; + +/** + * mm_charset_get_encoded_len: + * + * @utf8: UTF-8 valid string + * @charset: the #MMModemCharset to check the length of @utf8 in + * @out_unsupported: on return, number of characters of @utf8 that are not fully + * representable in @charset + * + * Returns: the size in bytes of the string if converted from UTF-8 into @charset. + **/ +guint +mm_charset_get_encoded_len (const char *utf8, + MMModemCharset charset, + guint *out_unsupported) +{ + const char *p = utf8, *next; + guint len = 0, unsupported = 0; + SubsetEntry *e; + + g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0); + g_return_val_if_fail (utf8 != NULL, 0); + + if (charset == MM_MODEM_CHARSET_UTF8) + return strlen (utf8); + + /* Find the charset in our subset table */ + for (e = &subset_table[0]; + e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN; + e++); + g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0); + + while (*p) { + gunichar c; + const char *end; + guint clen = 0; + + c = g_utf8_get_char_validated (p, -1); + g_return_val_if_fail (c != (gunichar) -1, 0); + end = next = g_utf8_find_next_char (p, NULL); + if (end == NULL) { + /* Find the end... */ + end = p; + while (*end++); + } + + if (!e->func (c, p, (end - p), &clen)) + unsupported++; + len += clen; + p = next; + } + + if (out_unsupported) + *out_unsupported = unsupported; + return len; +} + guint8 * gsm_unpack (const guint8 *gsm, guint32 num_septets, @@ -467,37 +643,127 @@ gsm_pack (const guint8 *src, guint8 start_offset, guint32 *out_packed_len) { - GByteArray *packed; - guint8 c, add_last = 0; - int i; + guint8 *packed; + guint octet = 0, lshift, plen; + int i = 0; - packed = g_byte_array_sized_new (src_len); + g_return_val_if_fail (start_offset < 8, NULL); - for (i = 0, c = 0; i < src_len; i++) { - guint8 bits_here, offset; - guint32 start_bit; + plen = (src_len * 7) + start_offset; /* total length in bits */ + if (plen % 8) + plen += 8; + plen /= 8; /* now in bytes */ - start_bit = start_offset + (i * 7); /* Overall bit offset of char in buffer */ - offset = start_bit % 8; /* Offset to start of char in this byte */ - bits_here = offset ? (8 - offset) : 7; + packed = g_malloc0 (plen); - c |= (src[i] & 0x7F) << offset; - if (offset) { - /* Add this packed byte */ - g_byte_array_append (packed, &c, 1); - c = add_last = 0; + for (i = 0, lshift = start_offset; i < src_len; i++) { + packed[octet] |= (src[i] & 0x7F) << lshift; + if (lshift > 1) { + /* Grab the lost bits and add to next octet */ + g_assert (octet + 1 < plen); + packed[octet + 1] = (src[i] & 0x7F) >> (8 - lshift); } + if (lshift) + octet++; + lshift = lshift ? lshift - 1 : 7; + } - /* Pack the rest of this char into the next byte */ - if (bits_here != 7) { - c = (src[i] & 0x7F) >> bits_here; - add_last = 1; + if (out_packed_len) + *out_packed_len = plen; + return packed; +} + +/* We do all our best to get the given string, which is possibly given in the + * specified charset, to UTF8. It may happen that the given string is really + * the hex representation of the charset-encoded string, so we need to cope with + * that case. */ +gchar * +mm_charset_take_and_convert_to_utf8 (gchar *str, + MMModemCharset charset) +{ + gchar *utf8 = NULL; + + if (!str) + return NULL; + + switch (charset) { + case MM_MODEM_CHARSET_UNKNOWN: + g_warn_if_reached (); + utf8 = str; + break; + + case MM_MODEM_CHARSET_HEX: + /* We'll assume that the HEX string is really valid ASCII at the end */ + utf8 = str; + break; + + case MM_MODEM_CHARSET_GSM: + case MM_MODEM_CHARSET_8859_1: + case MM_MODEM_CHARSET_PCCP437: + case MM_MODEM_CHARSET_PCDN: { + const gchar *iconv_from; + GError *error = NULL; + + iconv_from = charset_iconv_from (charset); + utf8 = g_convert (str, strlen (str), + "UTF-8//TRANSLIT", iconv_from, + NULL, NULL, &error); + if (!utf8 || error) { + g_clear_error (&error); + utf8 = NULL; } + + g_free (str); + break; } - if (add_last) - g_byte_array_append (packed, &c, 1); - *out_packed_len = packed->len; - return g_byte_array_free (packed, FALSE); -} + case MM_MODEM_CHARSET_UCS2: { + gsize len; + gboolean possibly_hex = TRUE; + /* If the string comes in hex-UCS-2, len needs to be a multiple of 4 */ + len = strlen (str); + if ((len < 4) || ((len % 4) != 0)) + possibly_hex = FALSE; + else { + const gchar *p = str; + + /* All chars in the string must be hex */ + while (*p && possibly_hex) + possibly_hex = isxdigit (*p++); + } + + /* If we get UCS-2, we expect the HEX representation of the string */ + if (possibly_hex) { + utf8 = mm_modem_charset_hex_to_utf8 (str, charset); + if (!utf8) { + /* If we couldn't convert the string as HEX-UCS-2, try to see if + * the string is valid UTF-8 itself. */ + utf8 = str; + } else + g_free (str); + } else + /* If we already know it's not hex, try to use the string as it is */ + utf8 = str; + + break; + } + + /* If the given charset is ASCII or UTF8, we really expect the final string + * already here */ + case MM_MODEM_CHARSET_IRA: + case MM_MODEM_CHARSET_UTF8: + utf8 = str; + break; + } + + /* Validate UTF-8 always before returning. This result will be exposed in DBus + * very likely... */ + if (!g_utf8_validate (utf8, -1, NULL)) { + /* Better return NULL than an invalid UTF-8 string */ + g_free (utf8); + utf8 = NULL; + } + + return utf8; +} |