diff options
-rw-r--r-- | bin/odfrecode | 75 | ||||
-rw-r--r-- | bin/odfrecode-gtk | 116 | ||||
-rw-r--r-- | odfrecode/__init__.py | 105 | ||||
-rw-r--r-- | odfrecode/recoders/__init__.py | 38 | ||||
-rw-r--r-- | odfrecode/recoders/armscii.py | 125 | ||||
-rw-r--r-- | odfrecode/recoders/cyrillic.py | 151 | ||||
-rw-r--r-- | odfrecode/recoders/georgian.py | 65 | ||||
-rw-r--r-- | odfrecode/recoders/greek.py | 154 | ||||
-rw-r--r-- | odfrecode/recoders/recoder.py | 45 | ||||
-rw-r--r-- | odfrecode/recoders/romanian.py | 62 | ||||
-rw-r--r-- | setup.py | 13 |
11 files changed, 949 insertions, 0 deletions
#!/usr/bin/python
# vim:encoding=utf-8:fileencoding=utf-8
#
# bin/odfrecode
#
# simple national to unicode open document format converter
#
# Migrating from M* W*rd to ODF can be a pain if documents use characters from
# fonts that only support some non unicode encodings like armscii. Remap these
# to the appropriate unicode codepoints. Any corrections are very welcome.
#
# (c) 2007,2008,2009 Guido Günther <agx@sigxcpu.org>
# (c) 2008 Torsten Werner <twerner@debian.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import os
import shutil
import sys
from optparse import OptionParser


def main(argv):
    """Recode one ODF document in place, keeping a backup of the original.

    @param argv: full argument vector, program name first; exactly one
                 file name and a --recoder option are expected
    @type argv: list of strings
    @return: 0 on success, 1 when the arguments are unusable, the file
             does not exist or the requested recoder is unknown
    """
    parser = OptionParser()
    parser.add_option("--recoder", dest="recoder",
                      help="use recoder RECODER", metavar="recoder")

    # Parse the vector we were handed instead of silently falling back to
    # sys.argv, so main() does what its signature promises.
    (options, args) = parser.parse_args(argv[1:])

    if len(args) != 1:
        sys.stderr.write("Need a file to convert.\n")
        return 1
    filename = os.path.abspath(args[0])

    if not options.recoder:
        sys.stderr.write("Missing recoder.\n")
        return 1

    if not os.path.isfile(filename):
        sys.stderr.write("'%s' not found.\n" % filename)
        return 1

    # The conversion package is only needed once the arguments are usable.
    import odfrecode

    try:
        converter = odfrecode.get_recoder(options.recoder)
    except KeyError:
        sys.stderr.write("No recoder for '%s' found.\n" % options.recoder)
        sys.stderr.write("Available recoders: %s.\n"
                         % ", ".join(sorted(odfrecode.get_recoders())))
        return 1

    # splitext() only inspects the last path component, so a file without
    # an extension (or one living in a directory with a dot in its name)
    # still gets a sensible backup name: "<name>.<recoder><.ext>".
    prefix, extension = os.path.splitext(filename)
    backup = "%s.%s%s" % (prefix, options.recoder, extension)
    try:
        os.unlink(backup)
    except OSError:
        pass # file doesn't exist
    shutil.copy(filename, backup)

    dstname = odfrecode.to_utf8(filename, converter)
    os.unlink(filename)
    shutil.copy(dstname, filename)
    os.unlink(dstname)
    try:
        # to_utf8() places its result in a temporary directory of its own;
        # rmdir() only succeeds if that directory is empty by now.
        os.rmdir(os.path.dirname(dstname))
    except OSError:
        pass

    sys.stdout.write("%s\n" % filename)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# vim:et:ts=4:sw=4:et:sts=4:ai:set list listchars=tab\:»·,trail\:·:
# vim:encoding=utf-8:fileencoding=utf-8
#
# bin/odfrecode-gtk
#
# call the appropriate recoders for the unicode open document format converter
#
# (c) 2007,2008,2009 Guido Günther <agx@sigxcpu.org>
# (c) 2008 Torsten Werner <twerner@debian.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import sys, os, shutil
import urllib
import odfrecode
from optparse import OptionParser
import gtk
import gnomevfs

# fonts that get replaced while converting: { old font name: new font name }
fontmap = { "EastRoman": "DejaVu Sans" }


class ConvError(Exception):
    """A problem that is reported to the user in an error dialog."""
    pass


class SuccessDialog(gtk.Dialog):
    """Dialog shown after a successful conversion.

    Offers a link that opens the converted document and names the backup
    copy of the original.
    """

    def __init__(self, filename, backup):
        """
        @param filename: absolute path of the converted document
        @type filename: string
        @param backup: path of the backup copy
        @type backup: string
        """
        # Quote the path so names containing spaces, '#', '%' etc. still
        # form a valid file: URI.
        self.uri = "file://%s" % urllib.pathname2url(filename)
        gtk.Dialog.__init__(self, buttons=(gtk.STOCK_OK, gtk.RESPONSE_ACCEPT))
        label = gtk.Label("successfully converted to Unicode.")
        uri = gtk.LinkButton(self.uri, label=os.path.basename(filename))
        uri.connect('clicked', self.show_uri)
        uri.set_tooltip_text("Click to open '%s'" % filename)

        stock = gtk.image_new_from_stock(gtk.STOCK_DIALOG_INFO,
                                         gtk.ICON_SIZE_DIALOG)
        hbox = gtk.HBox(False, 3)
        hbox.pack_start(stock, True, True, 0)
        hbox.add(uri)
        hbox.add(label)
        self.vbox.add(hbox)
        self.vbox.add(gtk.Label("A backup copy was saved as '%s'."
                                % os.path.basename(backup)))
        self.show_all()

    def show_uri(self, dummy):
        """Open the converted document and close the dialog."""
        gnomevfs.url_show(self.uri)
        self.destroy()


def main(argv):
    """Convert the document named in argv and report the outcome in a dialog.

    @param argv: full argument vector, program name first; exactly one
                 file name and a --recoder option are expected
    @type argv: list of strings
    """
    try:
        parser = OptionParser()
        parser.add_option("--recoder", dest="recoder",
                          help="use recoder RECODER", metavar="recoder")
        # honour the argv parameter instead of implicitly reading sys.argv
        (options, args) = parser.parse_args(argv[1:])

        if len(args) != 1:
            raise ConvError("Need a file to convert.")
        filename = os.path.abspath(args[0])

        try:
            fileinfo = gnomevfs.get_file_info(filename,
                                              gnomevfs.FILE_INFO_GET_MIME_TYPE)
        except gnomevfs.NotFoundError:
            raise ConvError("'%s' not found" % filename)
        if fileinfo.mime_type not in [ "application/vnd.oasis.opendocument.text",
                                       "application/vnd.oasis.opendocument.text-template" ]:
            raise ConvError("'%s' is no OpenDocument Text, but '%s'."
                            % (filename, fileinfo.mime_type))

        if not options.recoder:
            raise ConvError("Missing converter.\nSupported converters are: '%s'"
                            % ", ".join(odfrecode.get_recoders().keys()))
        try:
            converter = odfrecode.get_recoder(options.recoder)
        except KeyError:
            raise ConvError("No recoder for '%s' found." % options.recoder)

        # splitext() only inspects the last path component, so a file
        # without an extension (or below a directory with a dot in its
        # name) still gets a sensible backup name.
        prefix, extension = os.path.splitext(filename)
        backup = "%s.%s%s" % (prefix, options.recoder, extension)
        try:
            os.unlink(backup)
        except OSError:
            pass # file doesn't exist
        shutil.copy(filename, backup)

        dstname = odfrecode.to_utf8(filename, converter, fontmap)
        os.unlink(filename)
        shutil.copy(dstname, filename)
        os.unlink(dstname)
        try:
            # to_utf8() places its result in a temporary directory of its
            # own; rmdir() only succeeds if that directory is empty by now.
            os.rmdir(os.path.dirname(dstname))
        except OSError:
            pass
        dialog = SuccessDialog(filename, backup)
    except ConvError:
        # sys.exc_info() instead of a bound name keeps the clause valid on
        # every Python version.
        error = sys.exc_info()[1]
        dialog = gtk.MessageDialog(type=gtk.MESSAGE_ERROR, buttons=gtk.BUTTONS_OK,
                                   message_format="Error during conversion")
        dialog.format_secondary_text(str(error))
    dialog.set_property("title", "ODF Charset Converter")
    dialog.run()
    dialog.destroy()


if __name__ == "__main__":
    main(sys.argv)

# vim:et:ts=4:sw=4:et:sts=4:ai:set list listchars=tab\:»·,trail\:·:
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/__init__.py)
#
# (c) 2007,2008 Guido Günther <agx@sigxcpu.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import os
import zipfile
import tempfile
import shutil
import xml.dom.minidom

# archive members whose text nodes and font names get rewritten
__xmls = [ 'content.xml', 'styles.xml' ]


def __convert(textnode, weird_enc):
    """remap source encoding to the corresponding unicode codepoints"""
    # one join instead of repeated concatenation keeps long text nodes linear
    textnode.data = u''.join([weird_enc.recode(char) for char in textnode.data])


def __recode_xml_tree(node, weird_enc):
    """recode every childless text node below (and including) node"""
    if node.hasChildNodes():
        for kid in node.childNodes:
            __recode_xml_tree(kid, weird_enc)
    elif node.nodeType == node.TEXT_NODE:
        __convert(node, weird_enc)


def __subst_fonts(contents, fontmap):
    """
    substitute fonts according to fontmap
    @param contents: parsed document
    @type contents: xml.dom.minidom Document
    @param fontmap: { oldfont: newfont }; None or an empty dict leaves all
                    fonts untouched
    @type fontmap: dict
    """
    if not fontmap:
        # to_utf8() defaults fontmap to None; subscripting None would raise
        # TypeError on the first style that names a font.
        return
    for prop in contents.getElementsByTagName('style:text-properties'):
        oldfont = prop.getAttribute('style:font-name')
        if oldfont and oldfont in fontmap:
            prop.setAttribute('style:font-name', fontmap[oldfont])


def to_utf8(srcname, encoding, fontmap=None):
    """
    convert a odf document from a national encoding to unicode

    The source file is left untouched; the converted copy is written to a
    freshly created temporary directory under the same base name. Removing
    that file and directory is up to the caller.

    @param srcname: file to convert
    @type srcname: string
    @param encoding: recoder describing the source encoding; only its
                     recode(character) method is used
    @type encoding: Recoder subclass instance
    @param fontmap: font substitution map { oldfont1: newfont1, oldfont2: newfont2 }
    @type fontmap: dict
    @return: path of the converted copy
    @rtype: string
    """
    tempdir = tempfile.mkdtemp()
    dstname = os.path.join(tempdir, os.path.basename(srcname))

    try:
        src_odf = zipfile.ZipFile(srcname, 'r')
        try:
            dst_odf = zipfile.ZipFile(dstname, 'w')
            try:
                # infolist() keeps the archive order (so 'mimetype' stays
                # first); writing with the original ZipInfo keeps each
                # member's compression method and timestamp.
                for info in src_odf.infolist():
                    data = src_odf.read(info.filename)
                    if info.filename in __xmls:
                        contents = xml.dom.minidom.parseString(data)
                        __recode_xml_tree(contents, encoding)
                        __subst_fonts(contents, fontmap)
                        data = contents.toxml('utf-8')
                    dst_odf.writestr(info, data)
            finally:
                dst_odf.close()
        finally:
            src_odf.close()
    except Exception:
        # don't leave a half written copy (and its directory) behind
        shutil.rmtree(tempdir, ignore_errors=True)
        raise
    return dstname


def get_recoder(encoding):
    """
    get the recoder for a specific encoding
    @raise KeyError: if no recoder is registered under that name
    """
    # imported on first use so the conversion helpers above do not depend
    # on the recoder tables being importable
    import recoders
    return recoders.recoders[encoding]()


def get_recoders():
    """get a dict of all recoders"""
    import recoders
    return recoders.recoders

# vim:et:ts=4:sw=4:et:sts=4:ai:set list listchars=tab\:»·,trail\:·:
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/__init__.py)
#
# (c) 2007,2008,2009 Guido Günther <agx@sigxcpu.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import recoder
import armscii
import georgian
import romanian
import cyrillic
import greek

# Registry of the available recoders: name given on the command line ->
# recoder class (instantiated by the caller).
recoders = dict(
    armscii8=armscii.Armscii8,
    georgian=georgian.Georgian,
    romanian=romanian.Romanian,
    cyrillic=cyrillic.Cyrillic,
    greek=greek.Greek,
)

# vim:et:ts=4:sw=4:et:sts=4:ai:set list listchars=tab\:»·,trail\:·:
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/armscii.py)
#
# (c) 2007,2008 Guido Günther <agx@sigxcpu.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.
#
# the mapping table is based on a utf16 encoded table of a vbscript that is:
# (c) 2003 VAHE GEVORGYAN, UNDER GPL LICENSE

import recoder


class Armscii8(recoder.Recoder):
    """this class maps armscii8 to unicode"""

    encoding = 'armscii8'
    dst_encoding = 'utf-16-be'

    # source character -> big endian utf-16 code unit, ordered by source byte
    character_table = {
        # punctuation
        u'\xa3': '\x05\x89',
        u'\xa4': '\x00\x29',
        u'\xa5': '\x00\x28',
        u'\xa6': '\x00\xbb',
        u'\xa7': '\x00\xab',
        u'\xa8': '\x05\x87',
        u'\xa9': '\x00\x2e',
        u'\xaa': '\x05\x5d',
        u'\xab': '\x00\x2c',
        u'\xac': '\x00\x2d',
        u'\xad': '\x05\x8a',
        u'\xae': '\x20\x26',
        u'\xaf': '\x05\x5c',
        u'\xb0': '\x05\x5b',
        u'\xb1': '\x05\x5e',
        # letters: even source bytes map to U+0531.., odd ones to U+0561..
        u'\xb2': '\x05\x31',
        u'\xb3': '\x05\x61',
        u'\xb4': '\x05\x32',
        u'\xb5': '\x05\x62',
        u'\xb6': '\x05\x33',
        u'\xb7': '\x05\x63',
        u'\xb8': '\x05\x34',
        u'\xb9': '\x05\x64',
        u'\xba': '\x05\x35',
        u'\xbb': '\x05\x65',
        u'\xbc': '\x05\x36',
        u'\xbd': '\x05\x66',
        u'\xbe': '\x05\x37',
        u'\xbf': '\x05\x67',
        u'\xc0': '\x05\x38',
        u'\xc1': '\x05\x68',
        u'\xc2': '\x05\x39',
        u'\xc3': '\x05\x69',
        u'\xc4': '\x05\x3a',
        u'\xc5': '\x05\x6a',
        u'\xc6': '\x05\x3b',
        u'\xc7': '\x05\x6b',
        u'\xc8': '\x05\x3c',
        u'\xc9': '\x05\x6c',
        u'\xca': '\x05\x3d',
        u'\xcb': '\x05\x6d',
        u'\xcc': '\x05\x3e',
        u'\xcd': '\x05\x6e',
        u'\xce': '\x05\x3f',
        u'\xcf': '\x05\x6f',
        u'\xd0': '\x05\x40',
        u'\xd1': '\x05\x70',
        u'\xd2': '\x05\x41',
        u'\xd3': '\x05\x71',
        u'\xd4': '\x05\x42',
        u'\xd5': '\x05\x72',
        u'\xd6': '\x05\x43',
        u'\xd7': '\x05\x73',
        u'\xd8': '\x05\x44',
        u'\xd9': '\x05\x74',
        u'\xda': '\x05\x45',
        u'\xdb': '\x05\x75',
        u'\xdc': '\x05\x46',
        u'\xdd': '\x05\x76',
        u'\xde': '\x05\x47',
        u'\xdf': '\x05\x77',
        u'\xe0': '\x05\x48',
        u'\xe1': '\x05\x78',
        u'\xe2': '\x05\x49',
        u'\xe3': '\x05\x79',
        u'\xe4': '\x05\x4a',
        u'\xe5': '\x05\x7a',
        u'\xe6': '\x05\x4b',
        u'\xe7': '\x05\x7b',
        u'\xe8': '\x05\x4c',
        u'\xe9': '\x05\x7c',
        u'\xea': '\x05\x4d',
        u'\xeb': '\x05\x7d',
        u'\xec': '\x05\x4e',
        u'\xed': '\x05\x7e',
        u'\xee': '\x05\x4f',
        u'\xef': '\x05\x7f',
        u'\xf0': '\x05\x50',
        u'\xf1': '\x05\x80',
        u'\xf2': '\x05\x51',
        u'\xf3': '\x05\x81',
        u'\xf4': '\x05\x52',
        u'\xf5': '\x05\x82',
        u'\xf6': '\x05\x53',
        u'\xf7': '\x05\x83',
        u'\xf8': '\x05\x54',
        u'\xf9': '\x05\x84',
        u'\xfa': '\x05\x55',
        u'\xfb': '\x05\x85',
        u'\xfc': '\x05\x56',
        u'\xfd': '\x05\x86',
        u'\xfe': '\x05\x5a',
    }
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/cyrillic.py)
#
# (c) 2007,2009 Guido Günther <agx@sigxcpu.org>
# (c) 2007 Torsten Werner <twerner@debian.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import recoder


class Cyrillic(recoder.Recoder):
    """
    this class maps cyrillic characters from 0x80-0xff to unicode
    the mapping table is based on code from VeraWeb's madlan converter
    """

    encoding = 'cyrillic'
    dst_encoding = 'utf-16-be'

    # source character -> big endian utf-16 code unit
    # NOTE(review): the layout follows Windows-1251 except for a few entries
    # (0x7f, 0x88, 0x98, 0xa0) that are kept exactly as inherited from the
    # original table -- confirm against the fonts this is meant for.
    character_table = {
        u'\x7f': '\x04\x0c',
        u'\x80': '\x04\x02',
        u'\x81': '\x04\x03',
        u'\x82': '\x20\x1a',
        u'\x83': '\x04\x53',
        u'\x84': '\x20\x1e',
        u'\x85': '\x20\x26',
        u'\x86': '\x20\x20',
        u'\x87': '\x20\x21',
        u'\x88': '\x00\x20',
        u'\x89': '\x20\x30',
        u'\x8a': '\x04\x09',
        u'\x8b': '\x20\x39',
        u'\x8c': '\x04\x0a',
        u'\x8d': '\x04\x0c',
        u'\x8e': '\x04\x0b',
        u'\x8f': '\x04\x0f',
        u'\x90': '\x04\x52',
        u'\x91': '\x20\x18',
        u'\x92': '\x20\x19',
        u'\x93': '\x20\x1c',
        u'\x94': '\x20\x1d',
        u'\x95': '\x20\x22',
        u'\x96': '\x20\x13',
        u'\x97': '\x20\x14',
        u'\x98': '\x00\x20',
        u'\x99': '\x21\x22',
        u'\x9a': '\x04\x59',
        u'\x9b': '\x20\x3a',
        u'\x9c': '\x04\x5a',
        u'\x9d': '\x04\x5c',
        u'\x9e': '\x04\x5b',
        u'\x9f': '\x04\x5f',
        u'\xa0': '\x00\x20',
        u'\xa1': '\x04\x0e',
        u'\xa2': '\x04\x5e',
        u'\xa3': '\x04\x08',
        u'\xa5': '\x04\x90',
        u'\xa8': '\x04\x01',
        u'\xaa': '\x04\x04',
        u'\xaf': '\x04\x07',
        u'\xb2': '\x04\x06',
        # 0xb3 is the lower case partner of 0xb2 (U+0406 -> U+0456); it
        # used to repeat the U+0457 that belongs to 0xbf.
        u'\xb3': '\x04\x56',
        u'\xb4': '\x04\x91',
        u'\xb8': '\x04\x51',
        u'\xb9': '\x21\x16',
        u'\xba': '\x04\x54',
        u'\xbc': '\x04\x58',
        u'\xbd': '\x04\x05',
        u'\xbe': '\x04\x55',
        u'\xbf': '\x04\x57',
        # 0xc0-0xff: the basic alphabet, U+0410-U+044F in order
        u'\xc0': '\x04\x10',
        u'\xc1': '\x04\x11',
        u'\xc2': '\x04\x12',
        u'\xc3': '\x04\x13',
        u'\xc4': '\x04\x14',
        u'\xc5': '\x04\x15',
        u'\xc6': '\x04\x16',
        u'\xc7': '\x04\x17',
        u'\xc8': '\x04\x18',
        u'\xc9': '\x04\x19',
        u'\xca': '\x04\x1a',
        u'\xcb': '\x04\x1b',
        u'\xcc': '\x04\x1c',
        u'\xcd': '\x04\x1d',
        u'\xce': '\x04\x1e',
        u'\xcf': '\x04\x1f',
        u'\xd0': '\x04\x20',
        u'\xd1': '\x04\x21',
        u'\xd2': '\x04\x22',
        u'\xd3': '\x04\x23',
        u'\xd4': '\x04\x24',
        u'\xd5': '\x04\x25',
        u'\xd6': '\x04\x26',
        u'\xd7': '\x04\x27',
        u'\xd8': '\x04\x28',
        u'\xd9': '\x04\x29',
        u'\xda': '\x04\x2a',
        u'\xdb': '\x04\x2b',
        u'\xdc': '\x04\x2c',
        u'\xdd': '\x04\x2d',
        u'\xde': '\x04\x2e',
        u'\xdf': '\x04\x2f',
        u'\xe0': '\x04\x30',
        u'\xe1': '\x04\x31',
        u'\xe2': '\x04\x32',
        u'\xe3': '\x04\x33',
        u'\xe4': '\x04\x34',
        u'\xe5': '\x04\x35',
        u'\xe6': '\x04\x36',
        u'\xe7': '\x04\x37',
        u'\xe8': '\x04\x38',
        u'\xe9': '\x04\x39',
        u'\xea': '\x04\x3a',
        u'\xeb': '\x04\x3b',
        u'\xec': '\x04\x3c',
        u'\xed': '\x04\x3d',
        u'\xee': '\x04\x3e',
        u'\xef': '\x04\x3f',
        u'\xf0': '\x04\x40',
        u'\xf1': '\x04\x41',
        u'\xf2': '\x04\x42',
        u'\xf3': '\x04\x43',
        u'\xf4': '\x04\x44',
        u'\xf5': '\x04\x45',
        u'\xf6': '\x04\x46',
        u'\xf7': '\x04\x47',
        u'\xf8': '\x04\x48',
        u'\xf9': '\x04\x49',
        u'\xfa': '\x04\x4a',
        u'\xfb': '\x04\x4b',
        u'\xfc': '\x04\x4c',
        u'\xfd': '\x04\x4d',
        u'\xfe': '\x04\x4e',
        u'\xff': '\x04\x4f',
    }
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/georgian.py)
#
# (c) 2007,2008 Guido Günther <agx@sigxcpu.org>
# (c) 2008 Torsten Werner <twerner@debian.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import recoder


class Georgian(recoder.Recoder):
    """this class maps georgian to unicode"""

    encoding = 'georgian'
    dst_encoding = 'utf-8'

    # The source characters are plain ASCII letters, so the keys are spelled
    # as the letter itself (u'C' is the same string as u'\x43').
    character_table = {
        # upper case letters
        u'C': 'ჩ',
        u'J': 'ჟ',
        u'R': 'ღ',
        u'S': 'შ',
        u'T': 'თ',
        u'W': 'ჭ',
        u'Z': 'ძ',
        # lower case letters
        u'a': 'ა',
        u'b': 'ბ',
        u'c': 'ც',
        u'd': 'დ',
        u'e': 'ე',
        u'f': 'ფ',
        u'g': 'გ',
        u'h': 'ჰ',
        u'i': 'ი',
        u'j': 'ჯ',
        u'k': 'კ',
        u'l': 'ლ',
        u'm': 'მ',
        u'n': 'ნ',
        u'o': 'ო',
        u'p': 'პ',
        u'q': 'ქ',
        u'r': 'რ',
        u's': 'ს',
        u't': 'ტ',
        u'u': 'უ',
        u'v': 'ვ',
        u'w': 'წ',
        u'x': 'ხ',
        u'y': 'ყ',
        u'z': 'ზ',
    }
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/greek.py)
#
# (c) 2007,2009 Guido Günther <agx@sigxcpu.org>
# (c) 2007 Torsten Werner <twerner@debian.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import recoder


class Greek(recoder.Recoder):
    """
    this class maps greek characters from 0x80-0xff to unicode
    the mapping table is based on code from VeraWeb's madlan converter
    """

    # was 'cyrillic', copied over from the cyrillic recoder
    encoding = 'greek'
    dst_encoding = 'utf-16-be'

    # source character -> big endian utf-16 code unit
    character_table = {
        u'\x82': '\x20\x1a',
        u'\x83': '\x01\x92',
        u'\x84': '\x20\x1e',
        u'\x85': '\x20\x26',
        u'\x86': '\x20\x20',
        u'\x87': '\x20\x21',
        u'\x88': '\x02\xc6',
        u'\x89': '\x20\x30',
        u'\x8a': '\x01\x60',
        u'\x8b': '\x20\x39',
        u'\x8c': '\x01\x52',
        u'\x91': '\x20\x18',
        u'\x92': '\x20\x19',
        u'\x93': '\x20\x1c',
        # closing double quote, the partner of 0x93; was U+2010 (a hyphen)
        u'\x94': '\x20\x1d',
        u'\x95': '\x20\x22',
        u'\x96': '\x20\x13',
        u'\x97': '\x20\x14',
        u'\x98': '\x02\xdc',
        u'\x99': '\x21\x22',
        u'\x9a': '\x01\x61',
        u'\x9b': '\x20\x3a',
        u'\x9c': '\x01\x53',
        u'\x9f': '\x01\x78',
        u'\xa1': '\x03\x85',
        u'\xa2': '\x03\x86',
        u'\xa3': '\x00\xa3',
        u'\xa4': '\x00\xa4',
        u'\xa5': '\x00\xa5',
        u'\xa6': '\x00\xa6',
        u'\xa7': '\x00\xa7',
        u'\xa8': '\x00\xa8',
        u'\xa9': '\x00\xa9',
        u'\xaa': '\x00\xaa',
        u'\xab': '\x00\xab',
        u'\xac': '\x00\xac',
        u'\xad': '\x00\xad',
        u'\xae': '\x00\xae',
        u'\xaf': '\x00\xaf',
        u'\xb0': '\x00\xb0',
        u'\xb1': '\x00\xb1',
        u'\xb2': '\x00\xb2',
        u'\xb3': '\x00\xb3',
        u'\xb4': '\x00\xb4',
        u'\xb5': '\x00\xb5',
        u'\xb6': '\x00\xb6',
        u'\xb7': '\x00\xb7',
        u'\xb8': '\x03\x88',
        u'\xb9': '\x03\x89',
        u'\xba': '\x03\x8a',
        u'\xbb': '\x00\xbb',
        u'\xbc': '\x03\x8c',
        u'\xbd': '\x00\xbd',
        u'\xbe': '\x03\x8e',
        u'\xbf': '\x03\x8f',
        u'\xc0': '\x03\x90',
        u'\xc1': '\x03\x91',
        u'\xc2': '\x03\x92',
        u'\xc3': '\x03\x93',
        u'\xc4': '\x03\x94',
        u'\xc5': '\x03\x95',
        u'\xc6': '\x03\x96',
        u'\xc7': '\x03\x97',
        u'\xc8': '\x03\x98',
        u'\xc9': '\x03\x99',
        u'\xca': '\x03\x9a',
        u'\xcb': '\x03\x9b',
        u'\xcc': '\x03\x9c',
        u'\xcd': '\x03\x9d',
        u'\xce': '\x03\x9e',
        u'\xcf': '\x03\x9f',
        u'\xd0': '\x03\xa0',
        u'\xd1': '\x03\xa1',
        # NOTE(review): breaks the otherwise linear U+03A0.. run; kept as
        # inherited -- confirm against the source fonts.
        u'\xd2': '\x03\xda',
        u'\xd3': '\x03\xa3',
        u'\xd4': '\x03\xa4',
        u'\xd5': '\x03\xa5',
        u'\xd6': '\x03\xa6',
        u'\xd7': '\x03\xa7',
        u'\xd8': '\x03\xa8',
        u'\xd9': '\x03\xa9',
        u'\xda': '\x03\xaa',
        u'\xdb': '\x03\xab',
        u'\xdc': '\x03\xac',
        u'\xdd': '\x03\xad',
        u'\xde': '\x03\xae',
        u'\xdf': '\x03\xaf',
        u'\xe0': '\x03\xb0',
        u'\xe1': '\x03\xb1',
        u'\xe2': '\x03\xb2',
        u'\xe3': '\x03\xb3',
        u'\xe4': '\x03\xb4',
        u'\xe5': '\x03\xb5',
        u'\xe6': '\x03\xb6',
        u'\xe7': '\x03\xb7',
        u'\xe8': '\x03\xb8',
        u'\xe9': '\x03\xb9',
        u'\xea': '\x03\xba',
        u'\xeb': '\x03\xbb',
        u'\xec': '\x03\xbc',
        u'\xed': '\x03\xbd',
        u'\xee': '\x03\xbe',
        u'\xef': '\x03\xbf',
        u'\xf0': '\x03\xc0',
        u'\xf1': '\x03\xc1',
        u'\xf2': '\x03\xc2',
        u'\xf3': '\x03\xc3',
        u'\xf4': '\x03\xc4',
        u'\xf5': '\x03\xc5',
        u'\xf6': '\x03\xc6',
        u'\xf7': '\x03\xc7',
        u'\xf8': '\x03\xc8',
        u'\xf9': '\x03\xc9',
        u'\xfa': '\x03\xca',
        u'\xfb': '\x03\xcb',
        u'\xfc': '\x03\xcc',
        # upsilon with tonos; used to repeat the U+03CE of 0xfe
        u'\xfd': '\x03\xcd',
        u'\xfe': '\x03\xce',
    }
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/recoder.py)
#
# (c) 2007,2008 Guido Günther <agx@sigxcpu.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.


class Recoder(object):
    """
    base class for all encoders
    @classvar dst_encoding: encoding of the destination "column"
    @classvar encoding: name of the encoding
    @classvar character_table: maps a source character to its replacement

    To write a new encoder simply add a dictionary

        character_table = { "national_encoding_1": "utf8_1",
                            "national_encoding_2": "utf8_2",
                          }

    If you don't want to use utf8 as dictionary values specify dst_encoding
    """
    dst_encoding = 'utf-8'
    encoding = None
    # Empty by default so that a subclass without a table passes every
    # character through instead of failing with AttributeError.
    character_table = {}

    def recode(self, character):
        """
        return the unicode replacement for a single character
        @param character: one character of the source text
        @type character: unicode string
        @return: the character_table entry for character (byte strings are
                 decoded with dst_encoding, text is returned as it is), or
                 character itself if the table has no entry for it
        """
        try:
            mapped = self.character_table[character]
        except KeyError: # needs no remapping
            return character
        if isinstance(mapped, type(u'')):
            # already text, decoding it again would raise TypeError
            return mapped
        return mapped.decode(self.dst_encoding)
# vim:encoding=utf-8:fileencoding=utf-8
#
# odfrecode (odfrecode/recoders/romanian.py)
#
# (c) 2010 Guido Günther <agx@sigxcpu.org>
#
# Licensed under the GNU General Public License, version 2 or (at your
# option) any later version; distributed WITHOUT ANY WARRANTY.

import recoder


class Romanian(recoder.Recoder):
    """
    this class maps Romanian to unicode
    See https://bugzilla.redhat.com/show_bug.cgi?id=327501
    and http://en.wikipedia.org/wiki/Romanian_alphabet for details.
    """

    encoding = 'romanian'
    dst_encoding = 'utf-8'

    # Keys are written as code point escapes; the character each one stands
    # for is named in the comment next to it.
    character_table = {
        # Letters with "comma below" (Unicode 0218-021B) that were
        # incorrectly implemented as letters with "cedilla below":
        u'\u015e': 'Ș',   # S with cedilla (015E) -> S with comma below (0218)
        u'\u015f': 'ș',   # s with cedilla (015F) -> s with comma below (0219)
        u'\u0162': 'Ț',   # T with cedilla (0162) -> T with comma below (021A)
        u'\u0163': 'ț',   # t with cedilla (0163) -> t with comma below (021B)
        # Furthermore the Microsoft's EasternRoman Font has mappings from
        # "Latin-1 supplement" of these characters:
        u'\xaa': 'Ș',     # S with comma below at 0xaa
        u'\xba': 'ș',     # s with comma below at 0xba
        u'\xde': 'Ț',     # T with comma below at 0xde
        u'\xfe': 'ț',     # t with comma below at 0xfe
        u'\xc3': 'Ă',     # A with breve at 0xc3
        u'\xe3': 'ă',     # a with breve at 0xe3
    }


# ---------------------------------------------------------------------------
# setup.py
# ---------------------------------------------------------------------------
# vim:encoding=utf-8:fileencoding=utf-8

from distutils.core import setup

# installs both command line front ends and the two python packages
setup(
    name="odfrecode",
    version="0.0.1",
    author="Guido Günther",
    author_email="agx@sigxcpu.org",
    scripts=["bin/odfrecode", "bin/odfrecode-gtk"],
    packages=["odfrecode", "odfrecode.recoders"],
)