# Logo Search packages:
# Sourcecode: calibre version File versions  Download package
#
# process_tokens.py

#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#   You should have received a copy of the GNU General Public License   #
#   along with this program; if not, write to the Free Software         #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA            #
#   02111-1307 USA                                                      #
#                                                                       #
#                                                                       #
#########################################################################
import os, re,  tempfile
from calibre.ebooks.rtf2xml import copy, check_brackets
class ProcessTokens:
    """
    Process each token on a line and add information that will be useful for
    later processing. Information will be put on one line, delimited by "<"
    for main fields, and ">" for sub fields
    """
    def __init__(self,
            in_file,
            exception_handler,
            bug_handler,
            copy = None,
            run_level = 1,
            ):
        """
        Store the caller's settings and build the lookup tables.

        in_file -- kept as the input file reference (not opened here)
        exception_handler -- kept for later use by the instance
        bug_handler -- kept for later use by the instance
        copy -- optional value stored as-is (default None)
        run_level -- optional value stored as-is (default 1)

        A temporary file is created and its path remembered as the place
        to write output to.
        """
        self.__file = in_file
        self.__exception_handler = exception_handler
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # tempfile.mktemp() only invents a name, so another process could
        # create that file first.  mkstemp() creates the file securely; only
        # the path is kept, so the descriptor is closed straight away.
        handle, temp_path = tempfile.mkstemp()
        os.close(handle)
        self.__write_to = temp_path
        self.initiate_token_dict()
        ##self.initiate_token_actions()
        self.compile_expressions()
        self.__bracket_count = 0
    def compile_expressions(self):
        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
        self.__utf_exp = re.compile(r'(&.*?;)')
    def initiate_token_dict(self):
        """
        Reset the return code and build the three lookup tables.

        self.dict_token maps a token key (a string) to a 3-tuple:
            1. a two-letter group code (the section comments below give the
               meaning of each code, e.g. "paragraph formatting => pf"),
            2. the name string recorded for the token; entries handled by
               self.two_part_func carry a "<value" suffix in this string,
            3. the bound method that handles the token.
        self.__number_type_dict maps an integer to a text description of a
        numbering type.
        self.__language_dict maps an integer to a language name.
        """
        self.__return_code = 0
        # token key => (group code, name, handler method)
        self.dict_token={
        # unicode
        'mshex'             :   ('nu', '__________', self.__ms_hex_func),
        # brackets
        '{'                  :      ('nu', '{', self.ob_func),
        '}'                  :      ('nu', '}', self.cb_func),
        # microsoft characters
        'ldblquote'          :      ('mc', 'ldblquote', self.ms_sub_func),
        'rdblquote'          :      ('mc', 'rdblquote', self.ms_sub_func),
        'rquote'             :      ('mc', 'rquote', self.ms_sub_func),
        'lquote'             :      ('mc', 'lquote', self.ms_sub_func),
        'emdash'             :      ('mc', 'emdash', self.ms_sub_func),
        'endash'             :      ('mc', 'endash', self.ms_sub_func),
        'bullet'             :      ('mc', 'bullet', self.ms_sub_func),
        '~'                  :      ('mc', '~', self.ms_sub_func),
        'tab'                :      ('mc', 'tab', self.ms_sub_func),
        '_'                  :      ('mc', '_', self.ms_sub_func),
        ';'                  :      ('mc', ';', self.ms_sub_func),
        # this must be wrong
        '-'                  :      ('mc', '-', self.ms_sub_func),
        # misc => ml
        '*'                  :      ('ml', 'asterisk__', self.default_func),
        ':'                  :      ('ml', 'colon_____', self.default_func),
        # text
        'backslash'          :      ('nu', '\\', self.text_func),
        'ob'                 :      ('nu', '{', self.text_func),
        'cb'                 :      ('nu', '}', self.text_func),
        'line'               :  ('nu', ' ', self.text_func),
        # paragraph formatting => pf
        'page'               :  ('pf', 'page-break', self.default_func),
        'par'                :      ('pf', 'par-end___', self.default_func),
        'pard'               :      ('pf', 'par-def___', self.default_func),
        'keepn'              :      ('pf', 'keep-w-nex', self.bool_st_func),
        'widctlpar'          :      ('pf', 'widow-cntl', self.bool_st_func),
        'adjustright'        :      ('pf', 'adjust-rgt', self.bool_st_func),
        'lang'               :      ('pf', 'language__', self.__language_func),
        'ri'                 :      ('pf', 'right-inde', self.divide_by_20),
        'fi'                 :      ('pf', 'fir-ln-ind', self.divide_by_20),
        'li'                 :      ('pf', 'left-inden', self.divide_by_20),
        'sb'                 :      ('pf', 'space-befo', self.divide_by_20),
        'sa'                 :      ('pf', 'space-afte', self.divide_by_20),
        'sl'                 :      ('pf', 'line-space', self.divide_by_20),
        'deftab'             :      ('pf', 'default-ta', self.divide_by_20),
        'ql'                 :      ('pf', 'align_____<left', self.two_part_func),
        'qc'                 :      ('pf', 'align_____<cent', self.two_part_func),
        'qj'                 :      ('pf', 'align_____<just', self.two_part_func),
        'qr'                 :      ('pf', 'align_____<right', self.two_part_func),
        'nowidctlpar'        :      ('pf', 'widow-cntr<false', self.two_part_func),
        'tx'                 :  ('pf', 'tab-stop__', self.divide_by_20),
        'tb'                 :  ('pf', 'tab-bar-st', self.divide_by_20),
        'tqr'                :  ('pf', 'tab-right_', self.default_func),
        'tqdec'              :  ('pf', 'tab-dec___', self.default_func),
        'tqc'                :  ('pf', 'tab-center', self.default_func),
        'tlul'               :  ('pf', 'leader-und', self.default_func),
        'tlhyph'             :  ('pf', 'leader-hyp', self.default_func),
        'tldot'              :  ('pf', 'leader-dot', self.default_func),
        # stylesheet = > ss
        'stylesheet'         :      ('ss', 'style-shet', self.default_func),
        'sbasedon'           :      ('ss', 'based-on__', self.default_func),
        'snext'              :      ('ss', 'next-style', self.default_func),
        'cs'                 :      ('ss', 'char-style', self.default_func),
        's'                  :      ('ss', 'para-style', self.default_func),
        # graphics => gr
        'pict'               :      ('gr', 'picture___', self.default_func),
        'objclass'           :      ('gr', 'obj-class_', self.default_func),
        'macpict'            :      ('gr', 'mac-pic___', self.default_func),
        # section => sc
        'sect'               :      ('sc', 'section___', self.default_func),
        'sectd'              :      ('sc', 'sect-defin', self.default_func),
        'endhere'            :      ('sc', 'sect-note_', self.default_func),
        # list=> ls
        'pntext'             :      ('ls', 'list-text_', self.default_func),
        # this line must be wrong because it duplicates an earlier one
        'listtext'           :      ('ls', 'list-text_', self.default_func),
        'pn'                 :      ('ls', 'list______', self.default_func),
        'pnseclvl'           :      ('ls', 'list-level', self.default_func),
        'pncard'             :      ('ls', 'list-cardi', self.bool_st_func),
        'pndec'              :      ('ls', 'list-decim', self.bool_st_func),
        'pnucltr'            :      ('ls', 'list-up-al', self.bool_st_func),
        'pnucrm'             :      ('ls', 'list-up-ro', self.bool_st_func),
        'pnord'              :      ('ls', 'list-ord__', self.bool_st_func),
        'pnordt'             :      ('ls', 'list-ordte', self.bool_st_func),
        'pnlvlblt'           :      ('ls', 'list-bulli', self.bool_st_func),
        'pnlvlbody'          :      ('ls', 'list-simpi', self.bool_st_func),
        'pnlvlcont'          :      ('ls', 'list-conti', self.bool_st_func),
        'pnhang'             :      ('ls', 'list-hang_', self.bool_st_func),
        'pntxtb'             :      ('ls', 'list-tebef', self.bool_st_func),
        'ilvl'               :      ('ls', 'list-level', self.default_func),
        'ls'                 :      ('ls', 'list-id___', self.default_func),
        'pnstart'            :      ('ls', 'list-start', self.default_func),
        'itap'               :      ('ls', 'nest-level', self.default_func),
        'leveltext'          :  ('ls', 'level-text', self.default_func),
        'levelnumbers'       :  ('ls', 'level-numb', self.default_func),
        'list'               :  ('ls', 'list-in-tb', self.default_func),
        'listlevel'          :  ('ls', 'list-tb-le', self.default_func),
        'listname'           :  ('ls', 'list-name_', self.default_func),
        'listtemplateid'     :  ('ls', 'ls-tem-id_', self.default_func),
        'leveltemplateid'    :  ('ls', 'lv-tem-id_', self.default_func),
        'listhybrid'         :  ('ls', 'list-hybri', self.default_func),
        'levelstartat'       :  ('ls', 'level-star', self.default_func),
        'levelspace'         :  ('ls', 'level-spac', self.divide_by_20),
        'levelindent'        :  ('ls', 'level-inde', self.default_func),
        'levelnfc'           :  ('ls', 'level-type', self.__list_type_func),
        'levelnfcn'          :  ('ls', 'level-type', self.__list_type_func),
        'listid'             :  ('ls', 'lis-tbl-id',  self.default_func),
        'listoverride'       :  ('ls', 'lis-overid', self.default_func),
        # duplicate
        'pnlvl'              :      ('ls', 'list-level', self.default_func),
        # root info => ri
        'rtf'                :      ('ri', 'rtf_______', self.default_func),
        'deff'               :      ('ri', 'deflt-font', self.default_func),
        'mac'                :      ('ri', 'macintosh_', self.default_func),
        'ansi'               :      ('ri', 'ansi______', self.default_func),
        'ansicpg'            :      ('ri', 'ansi-codpg', self.default_func),
        # notes => nt
        'footnote'           :      ('nt', 'footnote__', self.default_func),
        'ftnalt'             :      ('nt', 'type______<endnote', self.two_part_func),
        # anchor => an
        'tc'                : ('an', 'toc_______', self.default_func),
        'bkmkstt'            :      ('an', 'book-mk-st', self.default_func),
        'bkmkstart'         : ('an', 'book-mk-st', self.default_func),
        'bkmkend'            :      ('an', 'book-mk-en', self.default_func),
        'xe'                 :      ('an', 'index-mark', self.default_func),
        'rxe'                :      ('an', 'place_____', self.default_func),
        # index => in
        'bxe'                :      ('in', 'index-bold', self.default_func),
        'ixe'                :      ('in', 'index-ital', self.default_func),
        'txe'                :      ('in', 'index-see_', self.default_func),
        # table of contents => tc
        'tcl'               :   ('tc', 'toc-level_', self.default_func),
        'tcn'               :   ('tc', 'toc-sup-nu', self.default_func),
        # field => fd
        'field'              :      ('fd', 'field_____', self.default_func),
        'fldinst'            :      ('fd', 'field-inst', self.default_func),
        'fldrslt'            :      ('fd', 'field-rslt', self.default_func),
        'datafield'          :      ('fd', 'datafield_', self.default_func),
        # info-tables => it
        'fonttbl'            :      ('it', 'font-table', self.default_func),
        'colortbl'           :      ('it', 'colr-table', self.default_func),
        'listoverridetable'  :      ('it', 'lovr-table', self.default_func),
        'listtable'          :      ('it', 'listtable_', self.default_func),
        'revtbl'             :      ('it', 'revi-table', self.default_func),
        # character info => ci
        'b'                  :      ('ci', 'bold______', self.bool_st_func),
        'blue'               :      ('ci', 'blue______', self.color_func),
        'caps'               :  ('ci', 'caps______', self.bool_st_func),
        'cf'                 :      ('ci', 'font-color', self.default_func),
        'chftn'              :      ('ci', 'footnot-mk', self.bool_st_func),
        'dn'                 :      ('ci', 'font-down_', self.divide_by_2),
        'embo'               :      ('ci', 'emboss____', self.bool_st_func),
        'f'                  :      ('ci', 'font-style', self.default_func),
        'fs'                 :      ('ci', 'font-size_', self.divide_by_2),
        'green'              :      ('ci', 'green_____', self.color_func),
        'i'                  :      ('ci', 'italics___', self.bool_st_func),
        'impr'               :      ('ci', 'engrave___', self.bool_st_func),
        'outl'               :      ('ci', 'outline___', self.bool_st_func),
        'plain'              :      ('ci', 'plain_____', self.bool_st_func),
        'red'                :      ('ci', 'red_______', self.color_func),
        'scaps'              :      ('ci', 'small-caps', self.bool_st_func),
        'shad'               :      ('ci', 'shadow____', self.bool_st_func),
        'strike'             :      ('ci', 'strike-thr', self.bool_st_func),
        'striked'            :      ('ci', 'dbl-strike', self.bool_st_func),
        'sub'                :      ('ci', 'subscript_', self.bool_st_func),
        'super'              :      ('ci', 'superscrip', self.bool_st_func),
        'nosupersub'         :      ('ci', 'no-su-supe', self.__no_sup_sub_func),
        'up'                 :      ('ci', 'font-up___', self.divide_by_2),
        'v'                  :      ('ci', 'hidden____', self.default_func),
        #  table => tb
        'trowd'              :      ('tb', 'row-def___', self.default_func),
        'cell'               :      ('tb', 'cell______', self.default_func),
        'row'                :      ('tb', 'row_______', self.default_func),
        'intbl'              :      ('tb', 'in-table__', self.default_func),
        'cols'               :      ('tb', 'columns___', self.default_func),
        'trleft'             :      ('tb', 'row-pos-le', self.divide_by_20),
        'cellx'              :      ('tb', 'cell-posit', self.divide_by_20),
        'trhdr'              :  ('tb', 'row-header', self.default_func),
        # preamble => pr
        # document information => di
        'info'               :      ('di', 'doc-info__', self.default_func),
        'author'             :      ('di', 'author____', self.default_func),
        'operator'           :      ('di', 'operator__', self.default_func),
        'title'              :      ('di', 'title_____', self.default_func),
        'keywords'           :  ('di', 'keywords__', self.default_func),
        'doccomm'            :  ('di', 'doc-notes_', self.default_func),
        'comment'            :  ('di', 'doc-notes_', self.default_func),
        'subject'            :  ('di', 'subject___', self.default_func),
        'creatim'            :      ('di', 'create-tim', self.default_func),
        'yr'                 :      ('di', 'year______', self.default_func),
        'mo'                 :      ('di', 'month_____', self.default_func),
        'dy'                 :      ('di', 'day_______', self.default_func),
        'min'                :      ('di', 'minute____', self.default_func),
        'revtim'             :      ('di', 'revis-time', self.default_func),
        'nofwords'           :      ('di', 'num-of-wor', self.default_func),
        'nofchars'           :      ('di', 'num-of-chr', self.default_func),
        'nofpages'           :      ('di', 'num-of-pag', self.default_func),
        'edmins'             :      ('di', 'edit-time_', self.default_func),
        # headers and footers => hf
        'headerf'            :      ('hf', 'head-first', self.default_func),
        'headerl'            :      ('hf', 'head-left_', self.default_func),
        'headerr'            :      ('hf', 'head-right', self.default_func),
        'footerf'            :      ('hf', 'foot-first', self.default_func),
        'footerl'            :      ('hf', 'foot-left_', self.default_func),
        'footerr'            :      ('hf', 'foot-right', self.default_func),
        'header'             :      ('hf', 'header____', self.default_func),
        'footer'             :      ('hf', 'footer____', self.default_func),
        # page => pa
        'margl'              :      ('pa', 'margin-lef', self.divide_by_20),
        'margr'              :      ('pa', 'margin-rig', self.divide_by_20),
        'margb'              :      ('pa', 'margin-bot', self.divide_by_20),
        'margt'              :      ('pa', 'margin-top', self.divide_by_20),
        'gutter'             :      ('pa', 'gutter____', self.divide_by_20),
        'paperw'             :      ('pa', 'paper-widt', self.divide_by_20),
        'paperh'             :      ('pa', 'paper-hght', self.divide_by_20),
        # annotation => an
        'annotation'         :  ('an', 'annotation', self.default_func),
        # underline
        'ul'                 :      ('ul', 'underlined<continous', self.two_part_func),
        'uld'                :      ('ul', 'underlined<dotted', self.two_part_func),
        'uldash'             :      ('ul', 'underlined<dash', self.two_part_func),
        'uldashd'            :      ('ul', 'underlined<dash-dot', self.two_part_func),
        'uldashdd'           :      ('ul', 'underlined<dash-dot-dot', self.two_part_func),
        'uldb'               :      ('ul', 'underlined<double', self.two_part_func),
        'ulhwave'            :      ('ul', 'underlined<heavy-wave', self.two_part_func),
        'ulldash'            :      ('ul', 'underlined<long-dash', self.two_part_func),
        'ulth'               :      ('ul', 'underlined<thich', self.two_part_func),
        'ulthd'              :      ('ul', 'underlined<thick-dotted', self.two_part_func),
        'ulthdash'           :      ('ul', 'underlined<thick-dash', self.two_part_func),
        'ulthdashd'          :      ('ul', 'underlined<thick-dash-dot', self.two_part_func),
        'ulthdashdd'         :      ('ul', 'underlined<thick-dash-dot-dot', self.two_part_func),
        'ulthldash'          :      ('ul', 'underlined<thick-long-dash', self.two_part_func),
        'ululdbwave'         :      ('ul', 'underlined<double-wave', self.two_part_func),
        'ulw'                :      ('ul', 'underlined<word', self.two_part_func),
        'ulwave'             :      ('ul', 'underlined<wave', self.two_part_func),
        'ulnone'             :      ('ul', 'underlined<false', self.two_part_func),
        # border => bd
        'trbrdrh'            :      ('bd', 'bor-t-r-hi', self.default_func),
        'trbrdrv'            :      ('bd', 'bor-t-r-vi', self.default_func),
        'trbrdrt'            :      ('bd', 'bor-t-r-to', self.default_func),
        'trbrdrl'            :      ('bd', 'bor-t-r-le', self.default_func),
        'trbrdrb'            :      ('bd', 'bor-t-r-bo', self.default_func),
        'trbrdrr'            :      ('bd', 'bor-t-r-ri', self.default_func),
        'clbrdrb'            :      ('bd', 'bor-cel-bo', self.default_func),
        'clbrdrt'            :      ('bd', 'bor-cel-to', self.default_func),
        'clbrdrl'            :      ('bd', 'bor-cel-le', self.default_func),
        'clbrdrr'            :      ('bd', 'bor-cel-ri', self.default_func),
        'brdrb'              :      ('bd', 'bor-par-bo', self.default_func),
        'brdrt'              :      ('bd', 'bor-par-to', self.default_func),
        'brdrl'              :      ('bd', 'bor-par-le', self.default_func),
        'brdrr'              :      ('bd', 'bor-par-ri', self.default_func),
        'box'                :      ('bd', 'bor-par-bx', self.default_func),
        'chbrdr'            : ('bd', 'bor-par-bo', self.default_func),
        'brdrbtw'            :      ('bd', 'bor-for-ev', self.default_func),
        'brdrbar'            :      ('bd', 'bor-outsid', self.default_func),
        'brdrnone'           :      ('bd', 'bor-none__<false', self.two_part_func),
        # border type => bt
        'brdrs'              :      ('bt', 'bdr-single', self.default_func),
        'brdrth'             :      ('bt', 'bdr-doubtb', self.default_func),
        'brdrsh'             :      ('bt', 'bdr-shadow', self.default_func),
        'brdrdb'             :      ('bt', 'bdr-double', self.default_func),
        'brdrdot'            :      ('bt', 'bdr-dotted', self.default_func),
        'brdrdash'           :      ('bt', 'bdr-dashed', self.default_func),
        'brdrhair'           :      ('bt', 'bdr-hair__', self.default_func),
        'brdrinset'          :      ('bt', 'bdr-inset_', self.default_func),
        'brdrdashsm'         :      ('bt', 'bdr-das-sm', self.default_func),
        'brdrdashd'          :      ('bt', 'bdr-dot-sm', self.default_func),
        'brdrdashdd'         :      ('bt', 'bdr-dot-do', self.default_func),
        'brdroutset'         :      ('bt', 'bdr-outset', self.default_func),
        'brdrtriple'         :      ('bt', 'bdr-trippl', self.default_func),
        'brdrtnthsg'         :      ('bt', 'bdr-thsm__', self.default_func),
        'brdrthtnsg'         :      ('bt', 'bdr-htsm__', self.default_func),
        'brdrtnthtnsg'       :      ('bt', 'bdr-hthsm_', self.default_func),
        'brdrtnthmg'         :      ('bt', 'bdr-thm___', self.default_func),
        'brdrthtnmg'         :      ('bt', 'bdr-htm___', self.default_func),
        'brdrtnthtnmg'       :      ('bt', 'bdr-hthm__', self.default_func),
        'brdrtnthlg'         :      ('bt', 'bdr-thl___', self.default_func),
        'brdrtnthtnlg'       :      ('bt', 'bdr-hthl__', self.default_func),
        'brdrwavy'           :      ('bt', 'bdr-wavy__', self.default_func),
        'brdrwavydb'         :      ('bt', 'bdr-d-wav_', self.default_func),
        'brdrdashdotstr'     :      ('bt', 'bdr-strip_', self.default_func),
        'brdremboss'         :      ('bt', 'bdr-embos_', self.default_func),
        'brdrengrave'        :      ('bt', 'bdr-engra_', self.default_func),
        'brdrframe'          :      ('bt', 'bdr-frame_', self.default_func),
        'brdrw'              :      ('bt', 'bdr-li-wid', self.divide_by_20),
        'brsp'              : ('bt', 'bdr-sp-wid', self.divide_by_20),
        'brdrcf'              :     ('bt', 'bdr-color_', self.default_func),
        # comments
        # 'comment'              :  ('cm', 'comment___', self.default_func),
        }
        # integer numbering-type code => text description
        # NOTE(review): the keys 1246, 1346, 2046 and 2146 break the otherwise
        # sequential numbering; possibly 12, 13, 20 and 21 were intended --
        # confirm against the RTF specification before changing them.
        self.__number_type_dict = {
            0:      'Arabic',
            1:      'uppercase Roman numeral',
            2:      'lowercase Roman numeral',
            3:      'uppercase letter',
            4:      'lowercase letter',
            5:      'ordinal number',
            6:      'cardianl text number',
            7:      'ordinal text number',
            10:     'Kanji numbering without the digit character',
            11:     'Kanji numbering with the digit character',
            1246:   'phonetic Katakana characters in aiueo order',
            1346:    'phonetic katakana characters in iroha order',
            14:     'double byte character',
            15:     'single byte character',
            16:     'Kanji numbering 3',
            17:     'Kanji numbering 4',
            18:     'Circle numbering' ,
            19:     'double-byte Arabic numbering',
            2046:   'phonetic double-byte Katakana characters',
            2146:   'phonetic double-byte katakana characters',
            22:     'Arabic with leading zero',
            23:     'bullet',
            24:     'Korean numbering 2',
            25:     'Korean numbering 1',
            26:     'Chinese numbering 1',
            27:     'Chinese numbering 2',
            28:     'Chinese numbering 3',
            29:     'Chinese numbering 4',
            30:     'Chinese Zodiac numbering 1',
            31:     'Chinese Zodiac numbering 2',
            32:     'Chinese Zodiac numbering 3',
            33:     'Taiwanese double-byte numbering 1',
            34:     'Taiwanese double-byte numbering 2',
            35:     'Taiwanese double-byte numbering 3',
            36:     'Taiwanese double-byte numbering 4',
            37:     'Chinese double-byte numbering 1',
            38:     'Chinese double-byte numbering 2',
            39:     'Chinese double-byte numbering 3',
            40:     'Chinese double-byte numbering 4',
            41:     'Korean double-byte numbering 1',
            42:     'Korean double-byte numbering 2',
            43:     'Korean double-byte numbering 3',
            44:     'Korean double-byte numbering 4',
            45:     'Hebrew non-standard decimal',
            46:     'Arabic Alif Ba Tah',
            47:     'Hebrew Biblical standard',
            48:     'Arabic Abjad style',
            255:    'No number',
        }
        # integer language code => language name
        self.__language_dict = {
            1078  :  'Afrikaans',
            1052  :  'Albanian',
            1025  :  'Arabic',
            5121  :  'Arabic Algeria',
            15361       :  'Arabic Bahrain',
            3073  :  'Arabic Egypt',
            1             :   'Arabic General',
            2049  :  'Arabic Iraq',
            11265       :  'Arabic Jordan',
            13313       :  'Arabic Kuwait',
            12289       :  'Arabic Lebanon',
            4097  :  'Arabic Libya',
            6145  :  'Arabic Morocco',
            8193  :  'Arabic Oman',
            16385       :  'Arabic Qatar',
            10241       :  'Arabic Syria',
            7169  :  'Arabic Tunisia',
            14337       :  'Arabic U.A.E.',
            9217  :  'Arabic Yemen',
            1067  :  'Armenian',
            1101  :  'Assamese',
            2092  :  'Azeri Cyrillic',
            1068  :  'Azeri Latin',
            1069  :  'Basque',
            1093  :  'Bengali',
            4122  :  'Bosnia Herzegovina',
            1026  :  'Bulgarian',
            1109  :  'Burmese',
            1059  :  'Byelorussian',
            1027  :  'Catalan',
            2052  :  'Chinese China',
            4             :  'Chinese General',
            3076  :  'Chinese Hong Kong',
            4100  :  'Chinese Singapore',
            1028  :  'Chinese Taiwan',
            1050  :  'Croatian',
            1029  :  'Czech',
            1030  :  'Danish',
            2067  :  'Dutch Belgium',
            1043  :  'Dutch Standard',
            3081  :  'English Australia',
            10249       :  'English Belize',
            2057  :  'English British',
            4105  :  'English Canada',
            9225  :  'English Caribbean',
            9             :  'English General',
            6153  :  'English Ireland',
            8201  :  'English Jamaica',
            5129  :  'English New Zealand',
            13321       :  'English Philippines',
            7177  :  'English South Africa',
            11273       :  'English Trinidad',
            1033  :  'English United States',
            1061  :  'Estonian',
            1080  :  'Faerose',
            1065  :  'Farsi',
            1035  :  'Finnish',
            1036  :  'French',
            2060  :  'French Belgium',
            11276       :  'French Cameroon',
            3084  :  'French Canada',
            12300       :  'French Cote d\'Ivoire',
            5132  :  'French Luxembourg',
            13324       :  'French Mali',
            6156  :  'French Monaco',
            8204  :  'French Reunion',
            10252       :  'French Senegal',
            4108  :  'French Swiss',
            7180  :  'French West Indies',
            9228  :  'French Democratic Republic of the Congo',
            1122  :  'Frisian',
            1084  :  'Gaelic',
            2108  :  'Gaelic Ireland',
            1110  :  'Galician',
            1079  :  'Georgian',
            1031  :  'German',
            3079  :  'German Austrian',
            5127  :  'German Liechtenstein',
            4103  :  'German Luxembourg',
            2055  :  'German Switzerland',
            1032  :  'Greek',
            1095  :  'Gujarati',
            1037  :  'Hebrew',
            1081  :  'Hindi',
            1038  :  'Hungarian',
            1039  :  'Icelandic',
            1057  :  'Indonesian',
            1040  :  'Italian',
            2064  :  'Italian Switzerland',
            1041  :  'Japanese',
            1099  :  'Kannada',
            1120  :  'Kashmiri',
            2144  :  'Kashmiri India',
            1087  :  'Kazakh',
            1107  :  'Khmer',
            1088  :  'Kirghiz',
            1111  :  'Konkani',
            1042  :  'Korean',
            2066  :  'Korean Johab',
            1108  :  'Lao',
            1062  :  'Latvian',
            1063  :  'Lithuanian',
            2087  :  'Lithuanian Classic',
            1086  :  'Malay',
            2110  :  'Malay Brunei Darussalam',
            1100  :  'Malayalam',
            1082  :  'Maltese',
            1112  :  'Manipuri',
            1102  :  'Marathi',
            1104  :  'Mongolian',
            1121  :  'Nepali',
            2145  :  'Nepali India',
            1044  :  'Norwegian Bokmal',
            2068  :  'Norwegian Nynorsk',
            1096  :  'Oriya',
            1045  :  'Polish',
            1046  :  'Portuguese (Brazil)',
            2070  :  'Portuguese (Portugal)',
            1094  :  'Punjabi',
            1047  :  'Rhaeto-Romanic',
            1048  :  'Romanian',
            2072  :  'Romanian Moldova',
            1049  :  'Russian',
            2073  :  'Russian Moldova',
            1083  :  'Sami Lappish',
            1103  :  'Sanskrit',
            3098  :  'Serbian Cyrillic',
            2074  :  'Serbian Latin',
            1113  :  'Sindhi',
            1051  :  'Slovak',
            1060  :  'Slovenian',
            1070  :  'Sorbian',
            11274       :  'Spanish Argentina',
            16394       :  'Spanish Bolivia',
            13322       :  'Spanish Chile',
            9226  :  'Spanish Colombia',
            5130  :  'Spanish Costa Rica',
            7178  :  'Spanish Dominican Republic',
            12298       :  'Spanish Ecuador',
            17418       :  'Spanish El Salvador',
            4106  :  'Spanish Guatemala',
            18442       :  'Spanish Honduras',
            2058  :  'Spanish Mexico',
            3082  :  'Spanish Modern',
            19466       :  'Spanish Nicaragua',
            6154  :  'Spanish Panama',
            15370       :  'Spanish Paraguay',
            10250       :  'Spanish Peru',
            20490       :  'Spanish Puerto Rico',
            1034  :  'Spanish Traditional',
            14346       :  'Spanish Uruguay',
            8202  :  'Spanish Venezuela',
            1072  :  'Sutu',
            1089  :  'Swahili',
            1053  :  'Swedish',
            2077  :  'Swedish Finland',
            1064  :  'Tajik',
            1097  :  'Tamil',
            1092  :  'Tatar',
            1098  :  'Telugu',
            1054  :  'Thai',
            1105  :  'Tibetan',
            1073  :  'Tsonga',
            1074  :  'Tswana',
            1055  :  'Turkish',
            1090  :  'Turkmen',
            1058  :  'Ukranian',
            1056  :  'Urdu',
            2080  :  'Urdu India',
            2115  :  'Uzbek Cyrillic',
            1091  :  'Uzbek Latin',
            1075  :  'Venda',
            1066  :  'Vietnamese',
            1106  :  'Welsh',
            1076  :  'Xhosa',
            1085  :  'Yiddish',
            1077  :  'Zulu',
            1024  :  'Unkown',
            255   :  'Unkown',
        }
    """
        # unknown
        # These must get passed on because they occure after \*
        'do'                :   ('un', 'unknown___', self.default_func),
        'company'           : ('un', 'company___', self.default_func),
        'shpinst'           :   ('un', 'unknown___', self.default_func),
        'panose'            :   ('un', 'unknown___', self.default_func),
        'falt'              :   ('un', 'unknown___', self.default_func),
        'listoverridetable' :   ('un', 'unknown___', self.default_func),
        'category'          :   ('un', 'unknown___', self.default_func),
        'template'          :   ('un', 'unknown___', self.default_func),
        'ud'                :   ('un', 'unknown___', self.default_func),
        'formfield'         :   ('un', 'unknown___', self.default_func),
        'ts'                :   ('un', 'unknown___', self.default_func),
        'rsidtbl'           :   ('un', 'unknown___', self.default_func),
        'generator'         :   ('un', 'unknown___', self.default_func),
        'ftnsep'            :   ('un', 'unknown___', self.default_func),
        'aftnsep'           :   ('un', 'unknown___', self.default_func),
        'aftnsepc'           :   ('un', 'unknown___', self.default_func),
        'aftncn'            :   ('un', 'unknown___', self.default_func),
        'objclass'           :   ('un', 'unknown___', self.default_func),
        'objdata'           :   ('un', 'unknown___', self.default_func),
        'picprop'           :   ('un', 'unknown___', self.default_func),
        'blipuid'           :   ('un', 'unknown___', self.default_func),
    """
    def __ms_hex_func(self, pre, token, num):
        num = num[1:] # chop off leading 0, which I added
        num = num.upper() # the mappings store hex in caps
        return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
    def ms_sub_func(self, pre, token, num):
        """Return a 'tx<mc' line carrying token; pre and num are ignored."""
        template = 'tx<mc<__________<%s\n'
        return template % token
    def default_func(self, pre, token, num):
        """
        Return the generic control-word line 'cw<pre<token<nu<value'.

        A missing number (None) is written as the string 'true'.
        """
        value = 'true' if num is None else num
        return 'cw<%s<%s<nu<%s\n' % (pre, token, value)
    def __list_type_func(self, pre, token, num):
        type = 'arabic'
        if num == None:
            type = 'Arabic'
        else:
            try:
                num = int(num)
            except ValueError:
                if self.__run_level > 3:
                    msg = 'number "%s" cannot be converted to integer\n' % num
                    raise self.__bug_handler, msg
            type = self.__number_type_dict.get(num)
            if type == None:
                if self.__run_level > 3:
                    msg = 'No type for "%s" in self.__number_type_dict\n'
                    raise self.__bug_handler
                type = 'Arabic'
        return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
    def __language_func(self, pre, token, num):
        lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
        if not lang_name:
            lang_name = "not defined"
            if self.__run_level > 3:
                msg = 'No entry for number "%s"' % num
                raise self.__bug_handler, msg
        return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
    def two_part_func(self, pre, token, num):
        """
        Return a control-word line for a token of the form 'name<value'.

        token is split on "<"; the first piece becomes the token field and
        the second the value. The num argument passed in is ignored.
        """
        pieces = token.split("<")
        name = pieces[0]
        value = pieces[1]
        return 'cw<%s<%s<nu<%s\n' % (pre, name, value)
    def divide_by_2(self, pre, token, num):
        num = self.divide_num(num, 2)
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
        ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
    def divide_by_20(self, pre, token, num):
        num = self.divide_num(num, 20)
        return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
        ##return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
    def text_func(self, pre, token, num=None):
        """Return a plain-text 'tx<nu' line carrying token; pre and num are
        ignored."""
        template = 'tx<nu<__________<%s\n'
        return template % token
    def ob_func(self, pre, token, num=None):
        self.__bracket_count += 1
        ##return 'ob<%04d\n' % self.__bracket_count
        return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
    def cb_func(self, pre, token, num=None):
        ##line = 'cb<%04d\n' % self.__bracket_count
        line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
        self.__bracket_count -= 1
        return line
    def color_func(self, pre, token, num):
        """
        Return a control-word line whose value is num as uppercase hex.

        A trailing ';' on num is stripped and changes the third field from
        'nu' to 'en'. A hex string whose length is not exactly two gets a
        single "0" put in front of it.
        """
        if num[-1] == ';':
            third_field = 'en'
            digits = num[:-1]
        else:
            third_field = 'nu'
            digits = num
        hex_value = '%X' % int(digits)
        if len(hex_value) != 2:
            hex_value = "0" + hex_value
        return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, hex_value)
    def bool_st_func(self, pre, token, num):
        if num is None or num == '' or num == '1':
            return 'cw<%s<%s<nu<true\n' % (pre, token)
            ##return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
        elif num == '0':
            return 'cw<%s<%s<nu<false\n' % (pre, token)
                ##return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
        else:
            msg = 'boolean should have some value module process tokens\n'
            msg += 'token is ' + token + "\n"
            msg += "'" + num + "'" + "\n"
            raise self.__bug_handler, msg
    def __no_sup_sub_func(self, pre, token, num):
        the_string = 'cw<ci<subscript_<nu<false\n'
        the_string += 'cw<ci<superscrip<nu<false\n'
        return the_string
    def divide_num(self, numerator, denominator):
        try:
            numerator = float(re.search('[0-9.]+', numerator).group())            
        except TypeError, msg:
            if self.__run_level > 3:
                msg = 'no number to process?\n'
                msg += 'this indicates that the token '
                msg += ' \(\\li\) should have a number and does not\n'
                msg += 'numerator is "%s"\n' % numerator
                msg += 'denominator is "%s"\n' % denominator
                raise self.__bug_handler, msg
            if 5 > self.__return_code:
                self.__return_code = 5
            return 0
        num = '%0.2f' % round(numerator/denominator, 2)
        return num
        string_num = str(num)
        if string_num[-2:] == ".0":
            string_num = string_num[:-2]
        return string_num
    def split_let_num(self, token):
        match_obj = re.search(self.__num_exp,token)
        if match_obj != None:
            first = match_obj.group(1)
            second = match_obj.group(2)
            if not second:
                if self.__run_level > 3:
                    msg = "token is '%s' \n" % token
                    raise self.__bug_handler, msg
                return first, 0
        else:
            if self.__run_level > 3:
                msg = "token is '%s' \n" % token
                raise self.__bug_handler
            return token, 0
        return first, second
00716     def convert_to_hex(self,number):
        """Convert a string to uppercase hexidecimal"""
        num = int(number)
        try:
            hex_num = "%X" % num
            return hex_num
        except:
            raise self.__bug_handler
00724     def process_cw(self, token):
        """Change the value of the control word by determining what dictionary
        it belongs to"""
        special = [  '*', ':', '}', '{',   '~', '_', '-', ';' ]
        ##if token != "{" or token != "}":
        token = token[1:] # strip off leading \
        token = token.replace(" ", "")
        ##if not token: return
        only_alpha = token.isalpha()
        num = None
        if not only_alpha and token not in special:
            token, num = self.split_let_num(token)
        pre, token, action = self.dict_token.get(token, (None, None, None))
        if action:
            return action(pre, token, num)
    # unused function
    def initiate_token_actions(self):
        self.action_for_token={
        '{'     :   self.ob_func,
        '}'     :   self.cb_func,
        '\\'    :   self.process_cw,
        }
    # unused function
00747     def evaluate_token(self,token):
        """Evaluate tokens. Return a value if the token is not a
        control word. Otherwise, pass token onto another method
        for further evaluation."""
        token, action = self.dict_token.get(token[0:1])
        if action:
            line = action(token)
            return line
        else :
            return  'tx<nu<nu<nu<nu<%s\n' % token
    def __check_brackets(self, in_file):
        """
        Run the bracket checker over in_file.

        A check_brackets.CheckBrackets object is created (and kept on
        self.__check_brack_obj); the first item of what its check_brackets()
        method returns decides the outcome. Returns 1 when that item is
        false, otherwise None.
        """
        checker = check_brackets.CheckBrackets(file = in_file)
        self.__check_brack_obj = checker
        brackets_ok = checker.check_brackets()[0]
        if brackets_ok:
            return None
        return 1
00763     def process_tokens(self):
        """Main method for handling other methods. """
        first_token = 0
        second_token = 0
        read_obj = open(self.__file, 'r')
        write_obj = open(self.__write_to, 'w')
        line_to_read = "dummy"
        line_count = 0
        while line_to_read:
            line_to_read = read_obj.readline()
            token = line_to_read
            token = token.replace("\n","")
            if not token:
                continue
            line_count += 1
            try:
                token.decode('us-ascii')
            except UnicodeError, msg:
                msg = str(msg)
                msg += 'Invalid RTF: File not ascii encoded.\n'
                raise self.__exception_handler, msg
            if not first_token:
                if token != '\\{':
                    msg = 'Invalid RTF: document doesn\'t start with {\n'
                    raise self.__exception_handler, msg
                first_token = 1
            elif first_token and not second_token:
                if token[0:4] != '\\rtf':
                    msg ='Invalid RTF: document doesn\'t start with \\rtf \n'
                    raise self.__exception_handler, msg
                second_token = 1
            ##token = self.evaluate_token(token)
            the_index = token.find('\\ ')
            if token != None and  the_index > -1:
                msg ='Invalid RTF: token "\\ " not valid. \n'
                raise self.__exception_handler, msg
            elif token[0:1] == "\\":
                line = self.process_cw(token)
                if line != None:
                    write_obj.write(line)
            else:
                fields = re.split(self.__utf_exp, token)
                for field in fields:
                    if not field:
                        continue
                    if field[0:1] == '&':
                        write_obj.write('tx<ut<__________<%s\n' % field)
                    else:
                        write_obj.write('tx<nu<__________<%s\n' % field)
        read_obj.close()
        write_obj.close()
        if not line_count:
            msg ='Invalid RTF: file appears to be empty. \n'
            raise self.__exception_handler, msg
        copy_obj = copy.Copy(bug_handler = self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        bad_brackets = self.__check_brackets(self.__file)
        if bad_brackets:
            msg = 'Invalid RTF: document does not have matching brackets.\n'
            raise self.__exception_handler, msg
        else:
            return self.__return_code

# Generated by  Doxygen 1.6.0   Back to index