Logo Search packages:      
Sourcecode: calibre version File versions  Download package

hex_2_utf8.py

#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import sys, os, tempfile, cStringIO
from calibre.ebooks.rtf2xml import get_char_map, copy
from calibre.ebooks.rtf2xml.char_set import char_set
class Hex2Utf8:
    """
    Convert Microsoft hexadecimal character tokens to utf-8 entities.

    The input is a line-oriented token file. The first 16 characters of
    each line identify the token, and the payload starts at index 17 and
    runs to the end of the line (without the newline). Hexadecimal tokens
    are looked up in character maps and rewritten as 'tx<ut<' (utf entity)
    or 'tx<nu<' (plain text) tokens.

    Typical use: construct the object (optionally call update_values), then
    call convert_hex_2_utf8().
    """
    # Font faces that have their own character maps. Text in these fonts
    # is never upper-cased by the caps conversion.
    __special_fonts = ('Symbol', 'Wingdings', 'Zapf Dingbats')

    def __init__(self,
            in_file,
            area_to_convert,
            char_file,
            default_char_map,
            bug_handler,
            invalid_rtf_handler,
            copy=None,
            temp_dir=None,
            symbol = None,
            wingdings = None,
            caps = None,
            convert_caps = None,
            dingbats = None,
            run_level = 1,
            ):
        """
        Required:
            'in_file'--the file to convert (it is rewritten in place)
            'area_to_convert'--the area of file to convert; must be
            'preamble' or 'body'
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--exception class raised for developer errors
            'invalid_rtf_handler'--exception class for invalid RTF
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted for compatibility; not used here
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--stored, but not read anywhere in this class
            'convert_caps'--accepted, but ignored here: the conversion
            flags all start at 0 and are set through update_values
            'run_level'--above 4, an unknown character raises bug_handler
        Returns:
            nothing
        Raises:
            bug_handler if 'area_to_convert' is not valid
        """
        # The handlers must be stored first: the validation below raises
        # self.__bug_handler.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        self.__file = in_file
        self.__copy = copy
        self.__check_area(area_to_convert, 'Developer error! Wrong flag.\n')
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        # The temporary output file is created when a conversion starts.
        self.__write_to = None

    def __check_area(self, area_to_convert, msg_prefix=''):
        """
        Required:
            area_to_convert -- the value to validate
        Optional:
            msg_prefix -- text put in front of the error message
        Returns:
            nothing
        Logic:
            Raise bug_handler unless the area is 'preamble' or 'body'.
        """
        if area_to_convert not in ('preamble', 'body'):
            msg = (
            msg_prefix +
            'in module "hex_2_utf8.py\n'
            '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)

    def update_values(  self,
                        file,
                        area_to_convert,
                        char_file,
                        convert_caps,
                        convert_symbol,
                        convert_wingdings,
                        convert_zapf,
                        copy=None,
                        temp_dir=None,
                        symbol = None,
                        wingdings = None,
                        caps = None,
                        dingbats = None,
                    ):
        """
        Required:
            'file'--the file to convert
            'area_to_convert'--the area of file to convert; must be
            'preamble' or 'body'
            'char_file'--accepted, but not stored by this method
            'convert_caps'--whether to convert text in caps to upper case
            'convert_symbol'--whether to use the Symbol map for that font
            'convert_wingdings'--whether to use the Wingdings map
            'convert_zapf'--whether to use the Zapf Dingbats map
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted for compatibility; not used here
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'caps'--stored, but not read anywhere in this class
            'dingbats'--whether to load the dingbats character map
        Returns:
            nothing
        Raises:
            bug_handler if 'area_to_convert' is not valid
        """
        self.__file = file
        self.__copy = copy
        self.__check_area(area_to_convert)
        self.__area_to_convert = area_to_convert
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf

    def __initiate_values(self):
        """
        Required:
            Nothing
        Logic:
            Set values, including those for the dictionaries.
            The character maps are broken down into many different sets.
            For example, for the Symbol font, there is the standard part
            for hexadecimal numbers, and the part for Microsoft
            characters. Read each part in, and then combine them (later
            parts override earlier ones).
        """
        # The maps are read from the bundled char_set string; the
        # char_file passed to the constructor is replaced here.
        self.__char_file = cStringIO.StringIO(char_set)
        char_map_obj = get_char_map.GetCharMap(
                char_file = self.__char_file,
                bug_handler = self.__bug_handler,
                )

        def merged(*map_names):
            combined = {}
            for map_name in map_names:
                combined.update(char_map_obj.get_char_map(map = map_name))
            return combined

        # the default encoding system, the lower map for characters 0
        # through 128, and the encoding system for Microsoft characters.
        self.__def_dict = merged(
            self.__default_char_map, 'bottom_128', 'ms_standard')
        self.__current_dict = self.__def_dict
        self.__current_dict_name = 'default'
        self.__in_caps = 0
        self.__special_fonts_found = 0
        if self.__symbol:
            self.__symbol_dict = merged('SYMBOL', 'ms_symbol')
        if self.__wingdings:
            self.__wingdings_dict = merged('wingdings', 'ms_wingdings')
        if self.__dingbats:
            self.__dingbats_dict = merged('dingbats', 'ms_dingbats')
        # map from an entity to its capital equivalent
        self.__caps_uni_dict = char_map_obj.get_char_map(map = 'caps_uni')
        self.__build_dispatch_tables()

    def __build_dispatch_tables(self):
        """
        Required:
            Nothing
        Logic:
            Create the tables that map a state or a token to its handler,
            and reset the caps and font stacks.
        """
        self.__preamble_state_dict = {
            'preamble'      :       self.__preamble_func,
            'body'          :       self.__body_func,
            'mi<mk<body-open_'  :   self.__found_body_func,
            'tx<hx<__________'  :   self.__hex_text_func,
            }
        self.__body_state_dict = {
            'preamble'      :       self.__preamble_for_body_func,
            'body'          :       self.__body_for_body_func,
            }
        self.__in_body_dict = {
            'mi<mk<body-open_'  :   self.__found_body_func,
            'tx<ut<__________'  :   self.__utf_to_caps_func,
            'tx<hx<__________'  :   self.__hex_text_func,
            'tx<mc<__________'  :   self.__hex_text_func,
            'tx<nu<__________'  :   self.__text_func,
            'mi<mk<font______'  :   self.__start_font_func,
            'mi<mk<caps______'  :   self.__start_caps_func,
            'mi<mk<font-end__'  :   self.__end_font_func,
            'mi<mk<caps-end__'  :   self.__end_caps_func,
        }
        # The first entry of each stack is a sentinel that is never popped.
        self.__caps_list = ['false']
        self.__font_list = ['not-defined']

    def __caps_active(self):
        """
        Returns:
            True if text should be converted to upper case: caps
            conversion is on, the innermost caps marker is 'true', and the
            current font is not one of the special fonts.
        """
        return bool(
            self.__convert_caps
            and self.__caps_list[-1] == 'true'
            and self.__current_dict_name not in self.__special_fonts
            )

    def __hex_text_func(self, line):
        """
        Required:
            'line' -- the line
        Logic:
            Get the hex_num and look it up in the current dictionary. If
            the token is in the dictionary, then check if the value starts
            with a "&". If it does, then tag the result as utf text.
            Otherwise, tag it as normal text. In both cases apply the caps
            conversion when it is active.
            If the hex_num is not in the dictionary and its value is
            above 10, write an undefined-symbol tag; if the run level is
            above 4, also raise bug_handler.
        """
        hex_num = line[17:-1]
        converted = self.__current_dict.get(hex_num)
        if converted is None:
            self.__unknown_hex_func(hex_num)
            return
        if converted[0:1] == "&":
            # tag as utf-8
            if self.__caps_active():
                converted = self.__utf_token_to_caps_func(converted)
            self.__write_obj.write('tx<ut<__________<%s\n' % converted)
        else:
            # tag as normal text
            if self.__caps_active():
                converted = converted.upper()
            self.__write_obj.write('tx<nu<__________<%s\n' % converted)

    def __unknown_hex_func(self, hex_num):
        """
        Required:
            hex_num -- a hex token that is not in the current dictionary
        Logic:
            Values of 10 or less (and empty tokens) are dropped silently.
            Anything else is written as an undefined symbol.
        """
        token = hex_num.replace("'", '')
        the_num = 0
        if token:
            the_num = int(token, 16)
        if the_num > 10:
            self.__write_obj.write(
                'mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n'
                % hex_num)
            if self.__run_level > 4:
                msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                raise self.__bug_handler(msg)

    def __found_body_func(self, line):
        """
        Logic:
            The body-open marker has been found: switch to the body state
            and write the marker out.
        """
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Logic:
            Used when converting the preamble, once the body has been
            reached: write the line out unchanged.
        """
        self.__write_obj.write(line)

    def __preamble_func(self, line):
        """
        Logic:
            Used when converting the preamble: dispatch on the token, or
            write the line out unchanged if there is no handler for it.
        """
        action = self.__preamble_state_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __run_conversion(self, start_state, state_dict, debug_copy_name):
        """
        Required:
            start_state -- the state to start in
            state_dict -- maps the current state to the line handler
            debug_copy_name -- file name used for the debugging copy
        Returns:
            nothing
        Logic:
            Read the input line by line and pass each line to the handler
            for the current state, which writes to a temporary file. Then
            hand the temporary file to the copy object to replace the
            input, and remove the temporary file.
        """
        self.__state = start_state
        # mkstemp creates the file itself, so the name cannot be taken by
        # someone else between choosing it and opening it.
        temp_fd, self.__write_to = tempfile.mkstemp()
        os.close(temp_fd)
        try:
            self.__write_obj = open(self.__write_to, 'w')
            try:
                read_obj = open(self.__file, 'r')
                try:
                    for line in read_obj:
                        self.__token_info = line[:16]
                        action = state_dict.get(self.__state)
                        if action is None:
                            msg = ('error no state found in hex_2_utf8: %s\n'
                                % self.__state)
                            raise self.__bug_handler(msg)
                        action(line)
                finally:
                    read_obj.close()
            finally:
                self.__write_obj.close()
            copy_obj = copy.Copy(bug_handler = self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, debug_copy_name)
            # NOTE(review): rename presumably copies the content over
            # self.__file and leaves the source in place (the original
            # code removed the source afterwards) -- confirm.
            copy_obj.rename(self.__write_to, self.__file)
        finally:
            if os.path.exists(self.__write_to):
                os.remove(self.__write_to)

    def __convert_preamble(self):
        """
        Logic:
            Convert the hex tokens of the preamble; lines after the
            body-open marker are copied unchanged.
        """
        self.__run_conversion(
            'preamble', self.__preamble_state_dict,
            "preamble_utf_convert.data")

    def __preamble_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when converting the body, before the body has been
            reached. Write the line out unchanged, and switch to the body
            state on the body-open marker.
        """
        if self.__token_info == 'mi<mk<body-open_':
            # __found_body_func already writes the marker; writing it
            # here as well would duplicate it.
            self.__found_body_func(line)
        else:
            self.__write_obj.write(line)

    def __body_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when converting the body: dispatch on the token, or write
            the line out unchanged if there is no handler for it.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __select_dict_for_face(self, face):
        """
        Required:
            face -- the name of the font face now in effect
        Returns:
            nothing
        Logic:
            Make the character map of a special font current if
            conversion for that font is switched on; otherwise use the
            default map.
        """
        if face == 'Symbol' and self.__convert_symbol:
            self.__current_dict_name = 'Symbol'
            self.__current_dict = self.__symbol_dict
        elif face == 'Wingdings' and self.__convert_wingdings:
            self.__current_dict_name = 'Wingdings'
            self.__current_dict = self.__wingdings_dict
        elif face == 'Zapf Dingbats' and self.__convert_zapf:
            self.__current_dict_name = 'Zapf Dingbats'
            self.__current_dict = self.__dingbats_dict
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict

    def __start_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            add font face to font_list and select its character map. The
            marker line itself is not written out.
        """
        face = line[17:-1]
        self.__font_list.append(face)
        self.__select_dict_for_face(face)

    def __end_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            pop font_list and select the character map of the font that
            is in effect again. The sentinel entry is never popped; an
            unbalanced end marker is reported on stderr.
        """
        if len(self.__font_list) > 1:
            self.__font_list.pop()
        else:
            sys.stderr.write('module is hex_2_utf8\n')
            sys.stderr.write('method is end_font_func\n')
            sys.stderr.write('self.__font_list should be greater than one?\n')
        self.__select_dict_for_face(self.__font_list[-1])

    def __start_special_font_func_old(self, line):
        """
        Required:
            line -- line
        Returns:
            nothing
        Logic:
            change the dictionary to use in conversion
        """
        # NOTE(review): legacy method, not referenced by any dispatch
        # table in this class. It calls append on self.__current_dict,
        # which __initiate_values sets to a dictionary -- confirm it can
        # be removed.
        if self.__token_info == 'mi<mk<font-symbo':
            self.__current_dict.append(self.__symbol_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Symbol'
        elif self.__token_info == 'mi<mk<font-wingd':
            self.__special_fonts_found += 1
            self.__current_dict.append(self.__wingdings_dict)
            self.__current_dict_name = 'Wingdings'
        elif self.__token_info == 'mi<mk<font-dingb':
            self.__current_dict.append(self.__dingbats_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Zapf Dingbats'

    def __end_special_font_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            pop the last dictionary, which should be a special font
        """
        # NOTE(review): legacy method, not referenced by any dispatch
        # table in this class. It treats self.__current_dict as a list
        # and sets self.__dict_name, which nothing else reads -- confirm
        # it can be removed.
        if len(self.__current_dict) < 2:
            sys.stderr.write('module is hex_2_utf 8\n')
            sys.stderr.write('method is __end_special_font_func\n')
            sys.stderr.write('less than two dictionaries --can\'t pop\n')
            self.__special_fonts_found -= 1
        else:
            self.__current_dict.pop()
            self.__special_fonts_found -= 1
            self.__dict_name = 'default'

    def __start_caps_func_old(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1
        """
        # NOTE(review): legacy method, not referenced by any dispatch
        # table in this class.
        self.__in_caps = 1

    def __start_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1 and push the marker's value onto
            caps_list. The marker line itself is not written out.
        """
        self.__in_caps = 1
        self.__caps_list.append(line[17:-1])

    def __end_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the end of caps has been found. Pop
            caps_list. The sentinel entry is never popped; an unbalanced
            end marker is reported on stderr.
        """
        if len(self.__caps_list) > 1:
            self.__caps_list.pop()
        else:
            sys.stderr.write('Module is hex_2_utf8\n')
            sys.stderr.write('method is __end_caps_func\n')
            sys.stderr.write('caps list should be more than one?\n')

    def __text_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a special font, look up each character by its hex value in
            the font's map and write the results; characters without an
            entry are reported on stderr and dropped. Otherwise, convert
            to upper case if caps are active, and write the text out.
        """
        text = line[17:-1]
        if self.__current_dict_name in self.__special_fonts:
            pieces = []
            for letter in text:
                # same key format as the hex tokens, for example '41
                hex_num = "'%X" % ord(letter)
                converted = self.__current_dict.get(hex_num)
                if converted is None:
                    sys.stderr.write('module is hex_2_ut8\n')
                    sys.stderr.write('method is __text_func\n')
                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
                else:
                    pieces.append(converted)
            text = ''.join(pieces)
        elif self.__caps_active():
            text = text.upper()
        self.__write_obj.write('tx<nu<__________<%s\n' % text)

    def __utf_to_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Get the text, and use another method to convert it to its
            capital equivalent if caps are on. The current font is not
            checked here.
        """
        utf_text = line[17:-1]
        if self.__caps_list[-1] == 'true' and self.__convert_caps:
            utf_text = self.__utf_token_to_caps_func(utf_text)
        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)

    def __utf_token_to_caps_func(self, char_entity):
        """
        Required:
            char_entity -- an entity such as &#xE9;
        Returns:
            token converted to the capital equivalent, or the token
            unchanged if it has none
        Logic:
            RTF often stores text in caps as the lower case character.
            This function swaps the case by looking up the entity in a
            dictionary whose keys use four hex digits, so shorter
            entities are padded with zeros first.
        """
        # everything after "&#x", including the closing ";"
        hex_num = char_entity[3:]
        length = len(hex_num)
        if length == 3:
            hex_num = '00%s' % hex_num
        elif length == 4:
            hex_num = '0%s' % hex_num
        converted = self.__caps_uni_dict.get('&#x%s' % hex_num)
        if not converted:
            # bullets and other entities don't have capital equivalents
            return char_entity
        return converted

    def __convert_body(self):
        """
        Logic:
            Convert the tokens of the body.
        """
        self.__run_conversion(
            'body', self.__body_state_dict, "body_utf_convert.data")

    def convert_hex_2_utf8(self):
        """
        Required:
            Nothing
        Returns:
            nothing
        Logic:
            Load the character maps, then convert the area of the file
            that was chosen with 'area_to_convert'. The file is rewritten
            in place.
        """
        self.__initiate_values()
        if self.__area_to_convert == 'preamble':
            self.__convert_preamble()
        else:
            self.__convert_body()
"""
how to swap case for non-capitals
my_string.swapcase()
An example of how to use a hash for the caps function
(but I shouldn't need this, since utf text is separate
 from regular text?)
sub_dict = {
    "&#x0430;"   : "some other value"
    }
def my_sub_func(matchobj):
    info =  matchobj.group(0)
    value = sub_dict.get(info)
    return value
    return "f"
line = "&#x0430; more text"
reg_exp = re.compile(r'(?P<name>&#x0430;|&#x0431;)')
line2 = re.sub(reg_exp, my_sub_func, line)
print line2
"""

Generated by  Doxygen 1.6.0   Back to index