diff options
author | Guido Günther <agx@sigxcpu.org> | 2008-10-11 20:11:00 +0200 |
---|---|---|
committer | Guido Günther <agx@sigxcpu.org> | 2008-10-14 22:45:00 +0200 |
commit | f07e9117d0f018b6ec48d5f5dca599dcff280684 (patch) | |
tree | 45e2dc81fc178079a610bb4d26a56c7e4dc79b23 /odfrecode |
Initial commit
Diffstat (limited to 'odfrecode')
-rw-r--r-- | odfrecode/__init__.py | 105 | ||||
-rw-r--r-- | odfrecode/recoders/__init__.py | 38 | ||||
-rw-r--r-- | odfrecode/recoders/armscii.py | 125 | ||||
-rw-r--r-- | odfrecode/recoders/cyrillic.py | 151 | ||||
-rw-r--r-- | odfrecode/recoders/georgian.py | 65 | ||||
-rw-r--r-- | odfrecode/recoders/greek.py | 154 | ||||
-rw-r--r-- | odfrecode/recoders/recoder.py | 45 | ||||
-rw-r--r-- | odfrecode/recoders/romanian.py | 62 |
8 files changed, 745 insertions, 0 deletions
import os
import zipfile
import tempfile
import shutil
import xml.dom.minidom

import recoders

# archive members that contain text and style information
__xmls = [ 'content.xml', 'styles.xml' ]


def __convert(textnode, weird_enc):
    """
    remap source encoding to the corresponding unicode codepoints
    @param textnode: DOM text node whose data gets replaced in place
    @param weird_enc: object providing recode(character)
    """
    # join once instead of growing the string character by character
    textnode.data = u''.join([weird_enc.recode(char)
                              for char in textnode.data])


def __recode_xml_tree(node, weird_enc):
    """
    walk the DOM tree below node and recode every childless text node
    @param node: DOM node to start at
    @param weird_enc: object providing recode(character)
    """
    if node.hasChildNodes():
        for kid in node.childNodes:
            __recode_xml_tree(kid, weird_enc)
    elif node.nodeType == node.TEXT_NODE:
        __convert(node, weird_enc)


def __subst_fonts(contents, fontmap):
    """
    substitute fonts according to fontmap
    @param contents: parsed DOM document
    @param fontmap: { oldfont: newfont } or None/empty for "no substitution"
    """
    if not fontmap:
        # to_utf8() defaults to fontmap=None; indexing None would raise
        return
    for prop in contents.getElementsByTagName('style:text-properties'):
        oldfont = prop.getAttribute('style:font-name')
        if oldfont and oldfont in fontmap:
            prop.setAttribute('style:font-name', fontmap[oldfont])


def to_utf8(srcname, encoding, fontmap=None):
    """
    convert a odf document from a legacy font encoding to unicode

    The converted document is written to a freshly created temporary
    directory using the basename of srcname; the source file is not
    modified.

    @param srcname: file to convert
    @type srcname: string
    @param encoding: recoder for the encoding the document currently uses
    @type encoding: Recoder instance (see get_recoder)
    @param fontmap: font substitution map { oldfont1: newfont1, oldfont2: newfont2 }
    @type fontmap: dict
    @return: path of the converted document
    @rtype: string
    """
    tempdir = tempfile.mkdtemp()
    dstname = os.path.join(tempdir, os.path.basename(srcname))

    try:
        # No need to copy srcname to dstname first: opening the destination
        # in 'w' mode starts a new, empty archive anyway.
        # TODO: should handle the exception (no zipfile)
        src_odf = zipfile.ZipFile(srcname, 'r')
        try:
            dst_odf = zipfile.ZipFile(dstname, 'w')
            try:
                for fname in src_odf.namelist():
                    # TODO: should handle the exception (fname not found)
                    data = src_odf.read(fname)
                    if fname in __xmls:
                        contents = xml.dom.minidom.parseString(data)
                        __recode_xml_tree(contents, encoding)
                        __subst_fonts(contents, fontmap)
                        data = contents.toxml('utf-8')
                    dst_odf.writestr(fname, data)
            finally:
                dst_odf.close()
        finally:
            src_odf.close()
    except BaseException:
        # don't leave a half written document and its directory behind
        shutil.rmtree(tempdir, ignore_errors=True)
        raise
    return dstname


def get_recoder(encoding):
    """
    get the recoder for a specific encoding
    @param encoding: key into the recoder registry, e.g. "greek"
    @return: a new instance of the matching recoder class
    @raise KeyError: if no recoder is registered under that name
    """
    return recoders.recoders[encoding]()


def get_recoders():
    """get a dict of all recoders (name -> recoder class)"""
    return recoders.recoders
See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# + +import recoder +import armscii +import georgian +import romanian +import cyrillic +import greek + +# List of available recoders +recoders = { + "armscii8": armscii.Armscii8, + "georgian": georgian.Georgian, + "romanian": romanian.Romanian, + "cyrillic": cyrillic.Cyrillic, + "greek": greek.Greek, + } + +# vim:et:ts=4:sw=4:et:sts=4:ai:set list listchars=tab\:»·,trail\:·: diff --git a/odfrecode/recoders/armscii.py b/odfrecode/recoders/armscii.py new file mode 100644 index 0000000..9098ba6 --- /dev/null +++ b/odfrecode/recoders/armscii.py @@ -0,0 +1,125 @@ +# vim:encoding=utf-8:fileencoding=utf-8 +# +# odfrecode +# +# (c) 2007,2008 Guido Günther <agx@sigxcpu.org> +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
import recoder


class Armscii8(recoder.Recoder):
    """
    recoder for armscii8 encoded text

    The table values are big endian utf-16 byte pairs (see dst_encoding).
    The mapping table is based on a utf16 encoded table of a vbscript that is
    (c) 2003 VAHE GEVORGYAN, UNDER GPL LICENSE
    """

    dst_encoding = 'utf-16-be'
    encoding = 'armscii8'

    # sorted by source code point
    character_table = {
        # 0xa3 - 0xb1
        u'\xa3': '\x05\x89', u'\xa4': '\x00\x29',
        u'\xa5': '\x00\x28', u'\xa6': '\x00\xbb',
        u'\xa7': '\x00\xab', u'\xa8': '\x05\x87',
        u'\xa9': '\x00\x2e', u'\xaa': '\x05\x5d',
        u'\xab': '\x00\x2c', u'\xac': '\x00\x2d',
        u'\xad': '\x05\x8a', u'\xae': '\x20\x26',
        u'\xaf': '\x05\x5c', u'\xb0': '\x05\x5b',
        u'\xb1': '\x05\x5e',
        # 0xb2 - 0xfd, one even/odd source pair per row
        u'\xb2': '\x05\x31', u'\xb3': '\x05\x61',
        u'\xb4': '\x05\x32', u'\xb5': '\x05\x62',
        u'\xb6': '\x05\x33', u'\xb7': '\x05\x63',
        u'\xb8': '\x05\x34', u'\xb9': '\x05\x64',
        u'\xba': '\x05\x35', u'\xbb': '\x05\x65',
        u'\xbc': '\x05\x36', u'\xbd': '\x05\x66',
        u'\xbe': '\x05\x37', u'\xbf': '\x05\x67',
        u'\xc0': '\x05\x38', u'\xc1': '\x05\x68',
        u'\xc2': '\x05\x39', u'\xc3': '\x05\x69',
        u'\xc4': '\x05\x3a', u'\xc5': '\x05\x6a',
        u'\xc6': '\x05\x3b', u'\xc7': '\x05\x6b',
        u'\xc8': '\x05\x3c', u'\xc9': '\x05\x6c',
        u'\xca': '\x05\x3d', u'\xcb': '\x05\x6d',
        u'\xcc': '\x05\x3e', u'\xcd': '\x05\x6e',
        u'\xce': '\x05\x3f', u'\xcf': '\x05\x6f',
        u'\xd0': '\x05\x40', u'\xd1': '\x05\x70',
        u'\xd2': '\x05\x41', u'\xd3': '\x05\x71',
        u'\xd4': '\x05\x42', u'\xd5': '\x05\x72',
        u'\xd6': '\x05\x43', u'\xd7': '\x05\x73',
        u'\xd8': '\x05\x44', u'\xd9': '\x05\x74',
        u'\xda': '\x05\x45', u'\xdb': '\x05\x75',
        u'\xdc': '\x05\x46', u'\xdd': '\x05\x76',
        u'\xde': '\x05\x47', u'\xdf': '\x05\x77',
        u'\xe0': '\x05\x48', u'\xe1': '\x05\x78',
        u'\xe2': '\x05\x49', u'\xe3': '\x05\x79',
        u'\xe4': '\x05\x4a', u'\xe5': '\x05\x7a',
        u'\xe6': '\x05\x4b', u'\xe7': '\x05\x7b',
        u'\xe8': '\x05\x4c', u'\xe9': '\x05\x7c',
        u'\xea': '\x05\x4d', u'\xeb': '\x05\x7d',
        u'\xec': '\x05\x4e', u'\xed': '\x05\x7e',
        u'\xee': '\x05\x4f', u'\xef': '\x05\x7f',
        u'\xf0': '\x05\x50', u'\xf1': '\x05\x80',
        u'\xf2': '\x05\x51', u'\xf3': '\x05\x81',
        u'\xf4': '\x05\x52', u'\xf5': '\x05\x82',
        u'\xf6': '\x05\x53', u'\xf7': '\x05\x83',
        u'\xf8': '\x05\x54', u'\xf9': '\x05\x84',
        u'\xfa': '\x05\x55', u'\xfb': '\x05\x85',
        u'\xfc': '\x05\x56', u'\xfd': '\x05\x86',
        # 0xfe
        u'\xfe': '\x05\x5a',
        }
import recoder


class Cyrillic(recoder.Recoder):
    """
    this class maps cyrillic characters from 0x80-0xff to unicode
    the mapping table is based on code from VeraWeb's madlan converter

    The table values are big endian utf-16 byte pairs (see dst_encoding).
    """

    encoding = 'cyrillic'
    dst_encoding = 'utf-16-be'

    character_table = {
        u'\x7f': '\x04\x0c',
        u'\x80': '\x04\x02', u'\x81': '\x04\x03',
        u'\x82': '\x20\x1a', u'\x83': '\x04\x53',
        u'\x84': '\x20\x1e', u'\x85': '\x20\x26',
        u'\x86': '\x20\x20', u'\x87': '\x20\x21',
        u'\x88': '\x00\x20', u'\x89': '\x20\x30',
        u'\x8a': '\x04\x09', u'\x8b': '\x20\x39',
        u'\x8c': '\x04\x0a', u'\x8d': '\x04\x0c',
        u'\x8e': '\x04\x0b', u'\x8f': '\x04\x0f',
        u'\x90': '\x04\x52', u'\x91': '\x20\x18',
        u'\x92': '\x20\x19', u'\x93': '\x20\x1c',
        u'\x94': '\x20\x1d', u'\x95': '\x20\x22',
        u'\x96': '\x20\x13', u'\x97': '\x20\x14',
        u'\x98': '\x00\x20', u'\x99': '\x21\x22',
        u'\x9a': '\x04\x59', u'\x9b': '\x20\x3a',
        u'\x9c': '\x04\x5a', u'\x9d': '\x04\x5c',
        u'\x9e': '\x04\x5b', u'\x9f': '\x04\x5f',
        u'\xa0': '\x00\x20', u'\xa1': '\x04\x0e',
        u'\xa2': '\x04\x5e', u'\xa3': '\x04\x08',
        u'\xa5': '\x04\x90', u'\xa8': '\x04\x01',
        u'\xaa': '\x04\x04', u'\xaf': '\x04\x07',
        # 0xb2/0xb3 are the capital/small pair U+0406/U+0456; 0xb3 used to
        # point at U+0457 which is the target of 0xbf already
        u'\xb2': '\x04\x06', u'\xb3': '\x04\x56',
        u'\xb4': '\x04\x91', u'\xb8': '\x04\x51',
        u'\xb9': '\x21\x16', u'\xba': '\x04\x54',
        u'\xbc': '\x04\x58', u'\xbd': '\x04\x05',
        u'\xbe': '\x04\x55', u'\xbf': '\x04\x57',
        # 0xc0 - 0xff map linearly onto U+0410 - U+044F
        u'\xc0': '\x04\x10', u'\xc1': '\x04\x11',
        u'\xc2': '\x04\x12', u'\xc3': '\x04\x13',
        u'\xc4': '\x04\x14', u'\xc5': '\x04\x15',
        u'\xc6': '\x04\x16', u'\xc7': '\x04\x17',
        u'\xc8': '\x04\x18', u'\xc9': '\x04\x19',
        u'\xca': '\x04\x1a', u'\xcb': '\x04\x1b',
        u'\xcc': '\x04\x1c', u'\xcd': '\x04\x1d',
        u'\xce': '\x04\x1e', u'\xcf': '\x04\x1f',
        u'\xd0': '\x04\x20', u'\xd1': '\x04\x21',
        u'\xd2': '\x04\x22', u'\xd3': '\x04\x23',
        u'\xd4': '\x04\x24', u'\xd5': '\x04\x25',
        u'\xd6': '\x04\x26', u'\xd7': '\x04\x27',
        u'\xd8': '\x04\x28', u'\xd9': '\x04\x29',
        u'\xda': '\x04\x2a', u'\xdb': '\x04\x2b',
        u'\xdc': '\x04\x2c', u'\xdd': '\x04\x2d',
        u'\xde': '\x04\x2e', u'\xdf': '\x04\x2f',
        u'\xe0': '\x04\x30', u'\xe1': '\x04\x31',
        u'\xe2': '\x04\x32', u'\xe3': '\x04\x33',
        u'\xe4': '\x04\x34', u'\xe5': '\x04\x35',
        u'\xe6': '\x04\x36', u'\xe7': '\x04\x37',
        u'\xe8': '\x04\x38', u'\xe9': '\x04\x39',
        u'\xea': '\x04\x3a', u'\xeb': '\x04\x3b',
        u'\xec': '\x04\x3c', u'\xed': '\x04\x3d',
        u'\xee': '\x04\x3e', u'\xef': '\x04\x3f',
        u'\xf0': '\x04\x40', u'\xf1': '\x04\x41',
        u'\xf2': '\x04\x42', u'\xf3': '\x04\x43',
        u'\xf4': '\x04\x44', u'\xf5': '\x04\x45',
        u'\xf6': '\x04\x46', u'\xf7': '\x04\x47',
        u'\xf8': '\x04\x48', u'\xf9': '\x04\x49',
        u'\xfa': '\x04\x4a', u'\xfb': '\x04\x4b',
        u'\xfc': '\x04\x4c', u'\xfd': '\x04\x4d',
        u'\xfe': '\x04\x4e', u'\xff': '\x04\x4f',
        }
import recoder


class Georgian(recoder.Recoder):
    """
    recoder for georgian text that was typed with a latin keyboard layout

    Keys are the latin letters found in the document, values the utf-8
    encoded georgian letters they stand for (see dst_encoding).
    """

    dst_encoding = 'utf-8'
    encoding = 'georgian'

    character_table = {
        # capital latin letters
        u'C': 'ჩ',
        u'J': 'ჟ',
        u'R': 'ღ',
        u'S': 'შ',
        u'T': 'თ',
        u'W': 'ჭ',
        u'Z': 'ძ',
        # small latin letters
        u'a': 'ა',
        u'b': 'ბ',
        u'c': 'ც',
        u'd': 'დ',
        u'e': 'ე',
        u'f': 'ფ',
        u'g': 'გ',
        u'h': 'ჰ',
        u'i': 'ი',
        u'j': 'ჯ',
        u'k': 'კ',
        u'l': 'ლ',
        u'm': 'მ',
        u'n': 'ნ',
        u'o': 'ო',
        u'p': 'პ',
        u'q': 'ქ',
        u'r': 'რ',
        u's': 'ს',
        u't': 'ტ',
        u'u': 'უ',
        u'v': 'ვ',
        u'w': 'წ',
        u'x': 'ხ',
        u'y': 'ყ',
        u'z': 'ზ',
        }
import recoder


class Greek(recoder.Recoder):
    """
    this class maps greek characters from 0x80-0xff to unicode
    the mapping table is based on code from VeraWeb's madlan converter

    The table values are big endian utf-16 byte pairs (see dst_encoding).
    """

    # was 'cyrillic', a leftover from copying the cyrillic recoder
    encoding = 'greek'
    dst_encoding = 'utf-16-be'

    character_table = {
        u'\x82': '\x20\x1a', u'\x83': '\x01\x92',
        u'\x84': '\x20\x1e', u'\x85': '\x20\x26',
        u'\x86': '\x20\x20', u'\x87': '\x20\x21',
        u'\x88': '\x02\xc6', u'\x89': '\x20\x30',
        u'\x8a': '\x01\x60', u'\x8b': '\x20\x39',
        u'\x8c': '\x01\x52',
        u'\x91': '\x20\x18', u'\x92': '\x20\x19',
        # 0x93/0x94 are the left/right double quotation marks
        # U+201C/U+201D; 0x94 used to be U+2010
        u'\x93': '\x20\x1c', u'\x94': '\x20\x1d',
        u'\x95': '\x20\x22',
        u'\x96': '\x20\x13', u'\x97': '\x20\x14',
        u'\x98': '\x02\xdc', u'\x99': '\x21\x22',
        u'\x9a': '\x01\x61', u'\x9b': '\x20\x3a',
        u'\x9c': '\x01\x53', u'\x9f': '\x01\x78',
        u'\xa1': '\x03\x85', u'\xa2': '\x03\x86',
        u'\xa3': '\x00\xa3', u'\xa4': '\x00\xa4',
        u'\xa5': '\x00\xa5', u'\xa6': '\x00\xa6',
        u'\xa7': '\x00\xa7', u'\xa8': '\x00\xa8',
        u'\xa9': '\x00\xa9', u'\xaa': '\x00\xaa',
        u'\xab': '\x00\xab', u'\xac': '\x00\xac',
        u'\xad': '\x00\xad', u'\xae': '\x00\xae',
        u'\xaf': '\x00\xaf', u'\xb0': '\x00\xb0',
        u'\xb1': '\x00\xb1', u'\xb2': '\x00\xb2',
        u'\xb3': '\x00\xb3', u'\xb4': '\x00\xb4',
        u'\xb5': '\x00\xb5', u'\xb6': '\x00\xb6',
        u'\xb7': '\x00\xb7', u'\xb8': '\x03\x88',
        u'\xb9': '\x03\x89', u'\xba': '\x03\x8a',
        u'\xbb': '\x00\xbb', u'\xbc': '\x03\x8c',
        u'\xbd': '\x00\xbd', u'\xbe': '\x03\x8e',
        u'\xbf': '\x03\x8f',
        u'\xc0': '\x03\x90', u'\xc1': '\x03\x91',
        u'\xc2': '\x03\x92', u'\xc3': '\x03\x93',
        u'\xc4': '\x03\x94', u'\xc5': '\x03\x95',
        u'\xc6': '\x03\x96', u'\xc7': '\x03\x97',
        u'\xc8': '\x03\x98', u'\xc9': '\x03\x99',
        u'\xca': '\x03\x9a', u'\xcb': '\x03\x9b',
        u'\xcc': '\x03\x9c', u'\xcd': '\x03\x9d',
        u'\xce': '\x03\x9e', u'\xcf': '\x03\x9f',
        u'\xd0': '\x03\xa0', u'\xd1': '\x03\xa1',
        u'\xd2': '\x03\xda', u'\xd3': '\x03\xa3',
        u'\xd4': '\x03\xa4', u'\xd5': '\x03\xa5',
        u'\xd6': '\x03\xa6', u'\xd7': '\x03\xa7',
        u'\xd8': '\x03\xa8', u'\xd9': '\x03\xa9',
        u'\xda': '\x03\xaa', u'\xdb': '\x03\xab',
        u'\xdc': '\x03\xac', u'\xdd': '\x03\xad',
        u'\xde': '\x03\xae', u'\xdf': '\x03\xaf',
        u'\xe0': '\x03\xb0', u'\xe1': '\x03\xb1',
        u'\xe2': '\x03\xb2', u'\xe3': '\x03\xb3',
        u'\xe4': '\x03\xb4', u'\xe5': '\x03\xb5',
        u'\xe6': '\x03\xb6', u'\xe7': '\x03\xb7',
        u'\xe8': '\x03\xb8', u'\xe9': '\x03\xb9',
        u'\xea': '\x03\xba', u'\xeb': '\x03\xbb',
        u'\xec': '\x03\xbc', u'\xed': '\x03\xbd',
        u'\xee': '\x03\xbe', u'\xef': '\x03\xbf',
        u'\xf0': '\x03\xc0', u'\xf1': '\x03\xc1',
        u'\xf2': '\x03\xc2', u'\xf3': '\x03\xc3',
        u'\xf4': '\x03\xc4', u'\xf5': '\x03\xc5',
        u'\xf6': '\x03\xc6', u'\xf7': '\x03\xc7',
        u'\xf8': '\x03\xc8', u'\xf9': '\x03\xc9',
        u'\xfa': '\x03\xca', u'\xfb': '\x03\xcb',
        u'\xfc': '\x03\xcc',
        # 0xfd continues the linear run (U+03CD); it used to duplicate
        # the target of 0xfe (U+03CE)
        u'\xfd': '\x03\xcd', u'\xfe': '\x03\xce',
        }
class Recoder(object):
    """
    base class for all recoders
    @classvar dst_encoding: encoding of the destination "column"
    @classvar encoding: name of the encoding
    @classvar character_table: maps a source character to its replacement

    To write a new recoder simply add a dictionary

    >>> character_table = { "national_encoding_1": "utf8_1",
                            "national_encoding_2": "utf8_2",
                          }

    If you don't want to use utf8 as dictionary values specify dst_encoding
    """
    dst_encoding = 'utf-8'
    encoding = None
    # empty by default so the base class itself is usable: it recodes
    # every character to itself
    character_table = {}

    def recode(self, character):
        """
        recode a single character
        @param character: the character to look up
        @type character: unicode string
        @return: the replacement from character_table decoded using
                 dst_encoding or character itself if the table has no
                 entry for it
        @rtype: unicode string
        """
        try:
            mapped = self.character_table[character]
        except KeyError: # needs no remapping
            return character
        if isinstance(mapped, type(u'')):
            # already decoded, decoding a unicode string again would fail
            return mapped
        return mapped.decode(self.dst_encoding)
import recoder


class Romanian(recoder.Recoder):
    """
    recoder replacing wrongly used characters in Romanian text
    See https://bugzilla.redhat.com/show_bug.cgi?id=327501
    and http://en.wikipedia.org/wiki/Romanian_alphabet for details.

    Keys are given as unicode code point escapes, values are the utf-8
    encoded replacement letters (see dst_encoding).
    """

    dst_encoding = 'utf-8'
    encoding = 'romanian'

    character_table = {
        # The "with comma below" letters are often incorrectly implemented
        # as their "with cedilla below" look-alikes:
        u'\u015e': 'Ș',   # S with cedilla (015E) -> S with comma below (0218)
        u'\u015f': 'ș',   # s with cedilla (015F) -> s with comma below (0219)
        u'\u0162': 'Ț',   # T with cedilla (0162) -> T with comma below (021A)
        u'\u0163': 'ț',   # t with cedilla (0163) -> t with comma below (021B)
        # Furthermore the Microsoft's EasternRoman Font has mappings from
        # "Latin-1 supplement" of these characters:
        u'\xaa': 'Ș',     # S with comma below at 0xaa
        u'\xba': 'ș',     # s with comma below at 0xba
        u'\xde': 'Ț',     # T with comma below at 0xde
        u'\xfe': 'ț',     # t with comma below at 0xfe
        u'\xc3': 'Ă',     # A with breve at 0xc3
        u'\xe3': 'ă',     # a with breve at 0xe3
        }