i18n/polib.py
changeset 40185 19fc5a986669
parent 32889 a7310a477966
child 43076 2372284d9457
equal deleted inserted replaced
40184:c3b7d9c54edd 40185:19fc5a986669
     1 # -*- coding: utf-8 -*-
       
     2 # no-check-code
     1 # no-check-code
       
     2 # -*- coding: utf-8 -*-
     3 #
     3 #
     4 # License: MIT (see LICENSE file provided)
     4 # License: MIT (see LICENSE file provided)
     5 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
     5 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
     6 
     6 
     7 """
     7 """
    13 :func:`~polib.mofile` convenience functions.
    13 :func:`~polib.mofile` convenience functions.
    14 """
    14 """
    15 
    15 
    16 from __future__ import absolute_import
    16 from __future__ import absolute_import
    17 
    17 
    18 __author__    = 'David Jean Louis <izimobil@gmail.com>'
    18 __author__ = 'David Jean Louis <izimobil@gmail.com>'
    19 __version__   = '0.6.4'
    19 __version__ = '1.0.7'
    20 __all__       = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
    20 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
    21                  'detect_encoding', 'escape', 'unescape', 'detect_encoding',]
    21            'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
    22 
    22 
    23 import array
    23 import array
    24 import codecs
    24 import codecs
    25 import os
    25 import os
    26 import re
    26 import re
    27 import struct
    27 import struct
    28 import sys
    28 import sys
    29 import textwrap
    29 import textwrap
    30 import types
    30 
       
    31 try:
       
    32     import io
       
    33 except ImportError:
       
    34     # replacement of io.open() for python < 2.6
       
    35     # we use codecs instead
       
    36     class io(object):
       
    37         @staticmethod
       
    38         def open(fpath, mode='r', encoding=None):
       
    39             return codecs.open(fpath, mode, encoding)
    31 
    40 
    32 
    41 
    33 # the default encoding to use when encoding cannot be detected
    42 # the default encoding to use when encoding cannot be detected
    34 default_encoding = 'utf-8'
    43 default_encoding = 'utf-8'
    35 
    44 
       
    45 # python 2/3 compatibility helpers {{{
       
    46 
       
    47 
       
    48 if sys.version_info[:2] < (3, 0):
       
    49     PY3 = False
       
    50     text_type = unicode
       
    51 
       
    52     def b(s):
       
    53         return s
       
    54 
       
    55     def u(s):
       
    56         return unicode(s, "unicode_escape")
       
    57 
       
    58 else:
       
    59     PY3 = True
       
    60     text_type = str
       
    61 
       
    62     def b(s):
       
    63         return s.encode("latin-1")
       
    64 
       
    65     def u(s):
       
    66         return s
       
    67 # }}}
    36 # _pofile_or_mofile {{{
    68 # _pofile_or_mofile {{{
       
    69 
    37 
    70 
    38 def _pofile_or_mofile(f, type, **kwargs):
    71 def _pofile_or_mofile(f, type, **kwargs):
    39     """
    72     """
    40     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    73     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
    41     honor the DRY concept.
    74     honor the DRY concept.
    48     # parse the file
    81     # parse the file
    49     kls = type == 'pofile' and _POFileParser or _MOFileParser
    82     kls = type == 'pofile' and _POFileParser or _MOFileParser
    50     parser = kls(
    83     parser = kls(
    51         f,
    84         f,
    52         encoding=enc,
    85         encoding=enc,
    53         check_for_duplicates=kwargs.get('check_for_duplicates', False)
    86         check_for_duplicates=kwargs.get('check_for_duplicates', False),
       
    87         klass=kwargs.get('klass')
    54     )
    88     )
    55     instance = parser.parse()
    89     instance = parser.parse()
    56     instance.wrapwidth = kwargs.get('wrapwidth', 78)
    90     instance.wrapwidth = kwargs.get('wrapwidth', 78)
    57     return instance
    91     return instance
    58 
    92 # }}}
       
    93 # _is_file {{{
       
    94 
       
    95 
       
    96 def _is_file(filename_or_contents):
       
    97     """
       
    98     Safely returns the value of os.path.exists(filename_or_contents).
       
    99 
       
   100     Arguments:
       
   101 
       
   102     ``filename_or_contents``
       
   103         either a filename, or a string holding the contents of some file.
       
   104         In the latter case, this function will always return False.
       
   105     """
       
   106     try:
       
   107         return os.path.exists(filename_or_contents)
       
   108     except (ValueError, UnicodeEncodeError):
       
   109         return False
    59 # }}}
   110 # }}}
    60 # function pofile() {{{
   111 # function pofile() {{{
       
   112 
    61 
   113 
    62 def pofile(pofile, **kwargs):
   114 def pofile(pofile, **kwargs):
    63     """
   115     """
    64     Convenience function that parses the po or pot file ``pofile`` and returns
   116     Convenience function that parses the po or pot file ``pofile`` and returns
    65     a :class:`~polib.POFile` instance.
   117     a :class:`~polib.POFile` instance.
    78         encoding will be auto-detected).
   130         encoding will be auto-detected).
    79 
   131 
    80     ``check_for_duplicates``
   132     ``check_for_duplicates``
    81         whether to check for duplicate entries when adding entries to the
   133         whether to check for duplicate entries when adding entries to the
    82         file (optional, default: ``False``).
   134         file (optional, default: ``False``).
       
   135 
       
   136     ``klass``
       
   137         class which is used to instantiate the return value (optional,
       
   138         default: ``None``, the return value will be a :class:`~polib.POFile`
       
   139         instance).
    83     """
   140     """
    84     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
   141     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
    85 
       
    86 # }}}
   142 # }}}
    87 # function mofile() {{{
   143 # function mofile() {{{
       
   144 
    88 
   145 
    89 def mofile(mofile, **kwargs):
   146 def mofile(mofile, **kwargs):
    90     """
   147     """
    91     Convenience function that parses the mo file ``mofile`` and returns a
   148     Convenience function that parses the mo file ``mofile`` and returns a
    92     :class:`~polib.MOFile` instance.
   149     :class:`~polib.MOFile` instance.
   106         encoding will be auto-detected).
   163         encoding will be auto-detected).
   107 
   164 
   108     ``check_for_duplicates``
   165     ``check_for_duplicates``
   109         whether to check for duplicate entries when adding entries to the
   166         whether to check for duplicate entries when adding entries to the
   110         file (optional, default: ``False``).
   167         file (optional, default: ``False``).
       
   168 
       
   169     ``klass``
       
   170         class which is used to instantiate the return value (optional,
       
   171         default: ``None``, the return value will be a :class:`~polib.MOFile`
       
   172         instance).
   111     """
   173     """
   112     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
   174     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
   113 
       
   114 # }}}
   175 # }}}
   115 # function detect_encoding() {{{
   176 # function detect_encoding() {{{
       
   177 
   116 
   178 
   117 def detect_encoding(file, binary_mode=False):
   179 def detect_encoding(file, binary_mode=False):
   118     """
   180     """
   119     Try to detect the encoding used by the ``file``. The ``file`` argument can
   181     Try to detect the encoding used by the ``file``. The ``file`` argument can
   120     be a PO or MO file path or a string containing the contents of the file.
   182     be a PO or MO file path or a string containing the contents of the file.
   127         string, full or relative path to the po/mo file or its content.
   189         string, full or relative path to the po/mo file or its content.
   128 
   190 
   129     ``binary_mode``
   191     ``binary_mode``
   130         boolean, set this to True if ``file`` is a mo file.
   192         boolean, set this to True if ``file`` is a mo file.
   131     """
   193     """
   132     rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)')
   194     PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
       
   195     rxt = re.compile(u(PATTERN))
       
   196     rxb = re.compile(b(PATTERN))
   133 
   197 
   134     def charset_exists(charset):
   198     def charset_exists(charset):
   135         """Check whether ``charset`` is valid or not."""
   199         """Check whether ``charset`` is valid or not."""
   136         try:
   200         try:
   137             codecs.lookup(charset)
   201             codecs.lookup(charset)
   138         except LookupError:
   202         except LookupError:
   139             return False
   203             return False
   140         return True
   204         return True
   141 
   205 
   142     if not os.path.exists(file):
   206     if not _is_file(file):
   143         match = rx.search(file)
   207         match = rxt.search(file)
   144         if match:
   208         if match:
   145             enc = match.group(1).strip()
   209             enc = match.group(1).strip()
   146             if charset_exists(enc):
   210             if charset_exists(enc):
   147                 return enc
   211                 return enc
   148     else:
   212     else:
   149         if binary_mode:
   213         # For PY3, always treat as binary
       
   214         if binary_mode or PY3:
   150             mode = 'rb'
   215             mode = 'rb'
       
   216             rx = rxb
   151         else:
   217         else:
   152             mode = 'r'
   218             mode = 'r'
       
   219             rx = rxt
   153         f = open(file, mode)
   220         f = open(file, mode)
   154         for l in f.readlines():
   221         for l in f.readlines():
   155             match = rx.search(l)
   222             match = rx.search(l)
   156             if match:
   223             if match:
   157                 f.close()
   224                 f.close()
   158                 enc = match.group(1).strip()
   225                 enc = match.group(1).strip()
       
   226                 if not isinstance(enc, text_type):
       
   227                     enc = enc.decode('utf-8')
   159                 if charset_exists(enc):
   228                 if charset_exists(enc):
   160                     return enc
   229                     return enc
   161         f.close()
   230         f.close()
   162     return default_encoding
   231     return default_encoding
   163 
       
   164 # }}}
   232 # }}}
   165 # function escape() {{{
   233 # function escape() {{{
       
   234 
   166 
   235 
   167 def escape(st):
   236 def escape(st):
   168     """
   237     """
   169     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
   238     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
   170     the given string ``st`` and returns it.
   239     the given string ``st`` and returns it.
   172     return st.replace('\\', r'\\')\
   241     return st.replace('\\', r'\\')\
   173              .replace('\t', r'\t')\
   242              .replace('\t', r'\t')\
   174              .replace('\r', r'\r')\
   243              .replace('\r', r'\r')\
   175              .replace('\n', r'\n')\
   244              .replace('\n', r'\n')\
   176              .replace('\"', r'\"')
   245              .replace('\"', r'\"')
   177 
       
   178 # }}}
   246 # }}}
   179 # function unescape() {{{
   247 # function unescape() {{{
       
   248 
   180 
   249 
   181 def unescape(st):
   250 def unescape(st):
   182     """
   251     """
   183     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
   252     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
   184     the given string ``st`` and returns it.
   253     the given string ``st`` and returns it.
   191             return '\t'
   260             return '\t'
   192         if m == 'r':
   261         if m == 'r':
   193             return '\r'
   262             return '\r'
   194         if m == '\\':
   263         if m == '\\':
   195             return '\\'
   264             return '\\'
   196         return m # handles escaped double quote
   265         return m  # handles escaped double quote
   197     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
   266     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
   198 
       
   199 # }}}
   267 # }}}
   200 # class _BaseFile {{{
   268 # class _BaseFile {{{
       
   269 
   201 
   270 
   202 class _BaseFile(list):
   271 class _BaseFile(list):
   203     """
   272     """
   204     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
   273     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
   205     classes. This class should **not** be instantiated directly.
   274     classes. This class should **not** be instantiated directly.
   225             file, (optional, default: ``False``).
   294             file, (optional, default: ``False``).
   226         """
   295         """
   227         list.__init__(self)
   296         list.__init__(self)
   228         # the opened file handle
   297         # the opened file handle
   229         pofile = kwargs.get('pofile', None)
   298         pofile = kwargs.get('pofile', None)
   230         if pofile and os.path.exists(pofile):
   299         if pofile and _is_file(pofile):
   231             self.fpath = pofile
   300             self.fpath = pofile
   232         else:
   301         else:
   233             self.fpath = kwargs.get('fpath')
   302             self.fpath = kwargs.get('fpath')
   234         # the width at which lines should be wrapped
   303         # the width at which lines should be wrapped
   235         self.wrapwidth = kwargs.get('wrapwidth', 78)
   304         self.wrapwidth = kwargs.get('wrapwidth', 78)
   252                   [e for e in self if not e.obsolete]
   321                   [e for e in self if not e.obsolete]
   253         for entry in entries:
   322         for entry in entries:
   254             ret.append(entry.__unicode__(self.wrapwidth))
   323             ret.append(entry.__unicode__(self.wrapwidth))
   255         for entry in self.obsolete_entries():
   324         for entry in self.obsolete_entries():
   256             ret.append(entry.__unicode__(self.wrapwidth))
   325             ret.append(entry.__unicode__(self.wrapwidth))
   257         ret = '\n'.join(ret)
   326         ret = u('\n').join(ret)
   258 
   327 
   259         if type(ret) != types.UnicodeType:
   328         assert isinstance(ret, text_type)
   260             return unicode(ret, self.encoding)
   329         #if type(ret) != text_type:
       
   330         #    return unicode(ret, self.encoding)
   261         return ret
   331         return ret
   262 
   332 
   263     def __str__(self):
   333     if PY3:
   264         """
   334         def __str__(self):
   265         Returns the string representation of the file.
   335             return self.__unicode__()
   266         """
   336     else:
   267         return unicode(self).encode(self.encoding)
   337         def __str__(self):
       
   338             """
       
   339             Returns the string representation of the file.
       
   340             """
       
   341             return unicode(self).encode(self.encoding)
   268 
   342 
   269     def __contains__(self, entry):
   343     def __contains__(self, entry):
   270         """
   344         """
   271         Overriden ``list`` method to implement the membership test (in and
   345         Overridden ``list`` method to implement the membership test (in and
   272         not in).
   346         not in).
   273         The method considers that an entry is in the file if it finds an entry
   347         The method considers that an entry is in the file if it finds an entry
   274         that has the same msgid (the test is **case sensitive**).
   348         that has the same msgid (the test is **case sensitive**) and the same
       
   349         msgctxt (or none for both entries).
   275 
   350 
   276         Argument:
   351         Argument:
   277 
   352 
   278         ``entry``
   353         ``entry``
   279             an instance of :class:`~polib._BaseEntry`.
   354             an instance of :class:`~polib._BaseEntry`.
   280         """
   355         """
   281         return self.find(entry.msgid, by='msgid') is not None
   356         return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
       
   357             is not None
   282 
   358 
   283     def __eq__(self, other):
   359     def __eq__(self, other):
   284         return unicode(self) == unicode(other)
   360         return str(self) == str(other)
   285 
   361 
   286     def append(self, entry):
   362     def append(self, entry):
   287         """
   363         """
   288         Overriden method to check for duplicates entries, if a user tries to
   364         Overridden method to check for duplicate entries, if a user tries to
   289         add an entry that is already in the file, the method will raise a
   365         add an entry that is already in the file, the method will raise a
   290         ``ValueError`` exception.
   366         ``ValueError`` exception.
   291 
   367 
   292         Argument:
   368         Argument:
   293 
   369 
   298             raise ValueError('Entry "%s" already exists' % entry.msgid)
   374             raise ValueError('Entry "%s" already exists' % entry.msgid)
   299         super(_BaseFile, self).append(entry)
   375         super(_BaseFile, self).append(entry)
   300 
   376 
   301     def insert(self, index, entry):
   377     def insert(self, index, entry):
   302         """
   378         """
   303         Overriden method to check for duplicates entries, if a user tries to
   379         Overridden method to check for duplicate entries, if a user tries to
   304         add an entry that is already in the file, the method will raise a
   380         add an entry that is already in the file, the method will raise a
   305         ``ValueError`` exception.
   381         ``ValueError`` exception.
   306 
   382 
   307         Arguments:
   383         Arguments:
   308 
   384 
   330             e.msgstr = '\n'.join(strs) + '\n'
   406             e.msgstr = '\n'.join(strs) + '\n'
   331         if self.metadata_is_fuzzy:
   407         if self.metadata_is_fuzzy:
   332             e.flags.append('fuzzy')
   408             e.flags.append('fuzzy')
   333         return e
   409         return e
   334 
   410 
   335     def save(self, fpath=None, repr_method='__str__'):
   411     def save(self, fpath=None, repr_method='__unicode__'):
   336         """
   412         """
   337         Saves the po file to ``fpath``.
   413         Saves the po file to ``fpath``.
   338         If it is an existing file and no ``fpath`` is provided, then the
   414         If it is an existing file and no ``fpath`` is provided, then the
   339         existing file is rewritten with the modified data.
   415         existing file is rewritten with the modified data.
   340 
   416 
   352         if fpath is None:
   428         if fpath is None:
   353             fpath = self.fpath
   429             fpath = self.fpath
   354         if repr_method == 'to_binary':
   430         if repr_method == 'to_binary':
   355             fhandle = open(fpath, 'wb')
   431             fhandle = open(fpath, 'wb')
   356         else:
   432         else:
   357             fhandle = codecs.open(fpath, 'w', self.encoding)
   433             fhandle = io.open(fpath, 'w', encoding=self.encoding)
   358             if type(contents) != types.UnicodeType:
   434             if not isinstance(contents, text_type):
   359                 contents = contents.decode(self.encoding)
   435                 contents = contents.decode(self.encoding)
   360         fhandle.write(contents)
   436         fhandle.write(contents)
   361         fhandle.close()
   437         fhandle.close()
   362         # set the file path if not set
   438         # set the file path if not set
   363         if self.fpath is None and fpath:
   439         if self.fpath is None and fpath:
   379 
   455 
   380         ``include_obsolete_entries``
   456         ``include_obsolete_entries``
   381             boolean, whether to also search in entries that are obsolete.
   457             boolean, whether to also search in entries that are obsolete.
   382 
   458 
   383         ``msgctxt``
   459         ``msgctxt``
   384             string, allows to specify a specific message context for the
   460             string, allows specifying a specific message context for the
   385             search.
   461             search.
   386         """
   462         """
   387         if include_obsolete_entries:
   463         if include_obsolete_entries:
   388             entries = self[:]
   464             entries = self[:]
   389         else:
   465         else:
   390             entries = [e for e in self if not e.obsolete]
   466             entries = [e for e in self if not e.obsolete]
   391         for e in entries:
   467         for e in entries:
   392             if getattr(e, by) == st:
   468             if getattr(e, by) == st:
   393                 if msgctxt and e.msgctxt != msgctxt:
   469                 if msgctxt is not False and e.msgctxt != msgctxt:
   394                     continue
   470                     continue
   395                 return e
   471                 return e
   396         return None
   472         return None
   397 
   473 
   398     def ordered_metadata(self):
   474     def ordered_metadata(self):
   410             'PO-Revision-Date',
   486             'PO-Revision-Date',
   411             'Last-Translator',
   487             'Last-Translator',
   412             'Language-Team',
   488             'Language-Team',
   413             'MIME-Version',
   489             'MIME-Version',
   414             'Content-Type',
   490             'Content-Type',
   415             'Content-Transfer-Encoding'
   491             'Content-Transfer-Encoding',
       
   492             'Language',
       
   493             'Plural-Forms'
   416         ]
   494         ]
   417         ordered_data = []
   495         ordered_data = []
   418         for data in data_order:
   496         for data in data_order:
   419             try:
   497             try:
   420                 value = metadata.pop(data)
   498                 value = metadata.pop(data)
   421                 ordered_data.append((data, value))
   499                 ordered_data.append((data, value))
   422             except KeyError:
   500             except KeyError:
   423                 pass
   501                 pass
   424         # the rest of the metadata will be alphabetically ordered since there
   502         # the rest of the metadata will be alphabetically ordered since there
   425         # are no specs for this AFAIK
   503         # are no specs for this AFAIK
   426         keys = metadata.keys()
   504         for data in sorted(metadata.keys()):
   427         keys.sort()
       
   428         for data in keys:
       
   429             value = metadata[data]
   505             value = metadata[data]
   430             ordered_data.append((data, value))
   506             ordered_data.append((data, value))
   431         return ordered_data
   507         return ordered_data
   432 
   508 
   433     def to_binary(self):
   509     def to_binary(self):
   434         """
   510         """
   435         Return the binary representation of the file.
   511         Return the binary representation of the file.
   436         """
   512         """
   437         offsets = []
   513         offsets = []
   438         entries = self.translated_entries()
   514         entries = self.translated_entries()
       
   515 
   439         # the keys are sorted in the .mo file
   516         # the keys are sorted in the .mo file
   440         def cmp(_self, other):
   517         def cmp(_self, other):
   441             # msgfmt compares entries with msgctxt if it exists
   518             # msgfmt compares entries with msgctxt if it exists
   442             if _self.msgctxt:
   519             self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
   443                 self_msgid = _self.msgctxt
   520             other_msgid = other.msgctxt and other.msgctxt or other.msgid
   444             else:
       
   445                 self_msgid = _self.msgid
       
   446 
       
   447             if other.msgctxt:
       
   448                 other_msgid = other.msgctxt
       
   449             else:
       
   450                 other_msgid = other.msgid
       
   451             if self_msgid > other_msgid:
   521             if self_msgid > other_msgid:
   452                 return 1
   522                 return 1
   453             elif self_msgid < other_msgid:
   523             elif self_msgid < other_msgid:
   454                 return -1
   524                 return -1
   455             else:
   525             else:
   456                 return 0
   526                 return 0
   457         # add metadata entry
   527         # add metadata entry
   458         entries.sort(cmp)
   528         entries.sort(key=lambda o: o.msgctxt or o.msgid)
   459         mentry = self.metadata_as_entry()
   529         mentry = self.metadata_as_entry()
   460         #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
   530         #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
   461         entries = [mentry] + entries
   531         entries = [mentry] + entries
   462         entries_len = len(entries)
   532         entries_len = len(entries)
   463         ids, strs = '', ''
   533         ids, strs = b(''), b('')
   464         for e in entries:
   534         for e in entries:
   465             # For each string, we need size and file offset.  Each string is
   535             # For each string, we need size and file offset.  Each string is
   466             # NUL terminated; the NUL does not count into the size.
   536             # NUL terminated; the NUL does not count into the size.
   467             msgid = ''
   537             msgid = b('')
   468             if e.msgctxt:
   538             if e.msgctxt:
   469                 # Contexts are stored by storing the concatenation of the
   539                 # Contexts are stored by storing the concatenation of the
   470                 # context, a <EOT> byte, and the original string
   540                 # context, a <EOT> byte, and the original string
   471                 msgid = self._encode(e.msgctxt + '\4')
   541                 msgid = self._encode(e.msgctxt + '\4')
   472             if e.msgid_plural:
   542             if e.msgid_plural:
   473                 indexes = e.msgstr_plural.keys()
       
   474                 indexes.sort()
       
   475                 msgstr = []
   543                 msgstr = []
   476                 for index in indexes:
   544                 for index in sorted(e.msgstr_plural.keys()):
   477                     msgstr.append(e.msgstr_plural[index])
   545                     msgstr.append(e.msgstr_plural[index])
   478                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
   546                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
   479                 msgstr = self._encode('\0'.join(msgstr))
   547                 msgstr = self._encode('\0'.join(msgstr))
   480             else:
   548             else:
   481                 msgid += self._encode(e.msgid)
   549                 msgid += self._encode(e.msgid)
   482                 msgstr = self._encode(e.msgstr)
   550                 msgstr = self._encode(e.msgstr)
   483             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
   551             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
   484             ids  += msgid  + '\0'
   552             ids += msgid + b('\0')
   485             strs += msgstr + '\0'
   553             strs += msgstr + b('\0')
   486 
   554 
   487         # The header is 7 32-bit unsigned integers.
   555         # The header is 7 32-bit unsigned integers.
   488         keystart = 7*4+16*entries_len
   556         keystart = 7 * 4 + 16 * entries_len
   489         # and the values start after the keys
   557         # and the values start after the keys
   490         valuestart = keystart + len(ids)
   558         valuestart = keystart + len(ids)
   491         koffsets = []
   559         koffsets = []
   492         voffsets = []
   560         voffsets = []
   493         # The string table first has the list of keys, then the list of values.
   561         # The string table first has the list of keys, then the list of values.
   494         # Each entry has first the size of the string, then the file offset.
   562         # Each entry has first the size of the string, then the file offset.
   495         for o1, l1, o2, l2 in offsets:
   563         for o1, l1, o2, l2 in offsets:
   496             koffsets += [l1, o1+keystart]
   564             koffsets += [l1, o1 + keystart]
   497             voffsets += [l2, o2+valuestart]
   565             voffsets += [l2, o2 + valuestart]
   498         offsets = koffsets + voffsets
   566         offsets = koffsets + voffsets
   499         # check endianness for magic number
       
   500         if struct.pack('@h', 1) == struct.pack('<h', 1):
       
   501             magic_number = MOFile.LITTLE_ENDIAN
       
   502         else:
       
   503             magic_number = MOFile.BIG_ENDIAN
       
   504 
   567 
   505         output = struct.pack(
   568         output = struct.pack(
   506             "Iiiiiii",
   569             "Iiiiiii",
   507             magic_number,      # Magic number
   570             # Magic number
   508             0,                 # Version
   571             MOFile.MAGIC,
   509             entries_len,       # # of entries
   572             # Version
   510             7*4,               # start of key index
   573             0,
   511             7*4+entries_len*8, # start of value index
   574             # number of entries
   512             0, keystart        # size and offset of hash table
   575             entries_len,
   513                                # Important: we don't use hash tables
   576             # start of key index
       
   577             7 * 4,
       
   578             # start of value index
       
   579             7 * 4 + entries_len * 8,
       
   580             # size and offset of hash table, we don't use hash tables
       
   581             0, keystart
       
   582 
   514         )
   583         )
   515         output += array.array("i", offsets).tostring()
   584         if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
       
   585             output += array.array("i", offsets).tobytes()
       
   586         else:
       
   587             output += array.array("i", offsets).tostring()
   516         output += ids
   588         output += ids
   517         output += strs
   589         output += strs
   518         return output
   590         return output
   519 
   591 
   520     def _encode(self, mixed):
   592     def _encode(self, mixed):
   521         """
   593         """
   522         Encodes the given ``mixed`` argument with the file encoding if and
   594         Encodes the given ``mixed`` argument with the file encoding if and
   523         only if it's a unicode string and returns the encoded string.
   595         only if it's a unicode string and returns the encoded string.
   524         """
   596         """
   525         if type(mixed) == types.UnicodeType:
   597         if isinstance(mixed, text_type):
   526             return mixed.encode(self.encoding)
   598             mixed = mixed.encode(self.encoding)
   527         return mixed
   599         return mixed
   528 
       
   529 # }}}
   600 # }}}
   530 # class POFile {{{
   601 # class POFile {{{
       
   602 
   531 
   603 
   532 class POFile(_BaseFile):
   604 class POFile(_BaseFile):
   533     """
   605     """
   534     Po (or Pot) file reader/writer.
   606     Po (or Pot) file reader/writer.
   535     This class inherits the :class:`~polib._BaseFile` class and, by extension,
   607     This class inherits the :class:`~polib._BaseFile` class and, by extension,
   540         """
   612         """
   541         Returns the unicode representation of the po file.
   613         Returns the unicode representation of the po file.
   542         """
   614         """
   543         ret, headers = '', self.header.split('\n')
   615         ret, headers = '', self.header.split('\n')
   544         for header in headers:
   616         for header in headers:
   545             if header[:1] in [',', ':']:
   617             if not len(header):
       
   618                 ret += "#\n"
       
   619             elif header[:1] in [',', ':']:
   546                 ret += '#%s\n' % header
   620                 ret += '#%s\n' % header
   547             else:
   621             else:
   548                 ret += '# %s\n' % header
   622                 ret += '# %s\n' % header
   549 
   623 
   550         if type(ret) != types.UnicodeType:
   624         if not isinstance(ret, text_type):
   551             ret = unicode(ret, self.encoding)
   625             ret = ret.decode(self.encoding)
   552 
   626 
   553         return ret + _BaseFile.__unicode__(self)
   627         return ret + _BaseFile.__unicode__(self)
   554 
   628 
   555     def save_as_mofile(self, fpath):
   629     def save_as_mofile(self, fpath):
   556         """
   630         """
   570         """
   644         """
   571         total = len([e for e in self if not e.obsolete])
   645         total = len([e for e in self if not e.obsolete])
   572         if total == 0:
   646         if total == 0:
   573             return 100
   647             return 100
   574         translated = len(self.translated_entries())
   648         translated = len(self.translated_entries())
   575         return int((100.00 / float(total)) * translated)
   649         return int(translated * 100 / float(total))
   576 
   650 
   577     def translated_entries(self):
   651     def translated_entries(self):
   578         """
   652         """
   579         Convenience method that returns the list of translated entries.
   653         Convenience method that returns the list of translated entries.
   580         """
   654         """
   582 
   656 
   583     def untranslated_entries(self):
   657     def untranslated_entries(self):
   584         """
   658         """
   585         Convenience method that returns the list of untranslated entries.
   659         Convenience method that returns the list of untranslated entries.
   586         """
   660         """
   587         return [e for e in self if not e.translated() and not e.obsolete \
   661         return [e for e in self if not e.translated() and not e.obsolete
   588                 and not 'fuzzy' in e.flags]
   662                 and not 'fuzzy' in e.flags]
   589 
   663 
   590     def fuzzy_entries(self):
   664     def fuzzy_entries(self):
   591         """
   665         """
   592         Convenience method that returns the list of fuzzy entries.
   666         Convenience method that returns the list of fuzzy entries.
   613         Keyword argument:
   687         Keyword argument:
   614 
   688 
   615         ``refpot``
   689         ``refpot``
   616             object POFile, the reference catalog.
   690             object POFile, the reference catalog.
   617         """
   691         """
       
   692         # Store entries in dict/set for faster access
       
   693         self_entries = dict((entry.msgid, entry) for entry in self)
       
   694         refpot_msgids = set(entry.msgid for entry in refpot)
       
   695         # Merge entries that are in the refpot
   618         for entry in refpot:
   696         for entry in refpot:
   619             e = self.find(entry.msgid, include_obsolete_entries=True)
   697             e = self_entries.get(entry.msgid)
   620             if e is None:
   698             if e is None:
   621                 e = POEntry()
   699                 e = POEntry()
   622                 self.append(e)
   700                 self.append(e)
   623             e.merge(entry)
   701             e.merge(entry)
   624         # ok, now we must "obsolete" entries that are not in the refpot anymore
   702         # ok, now we must "obsolete" entries that are not in the refpot anymore
   625         for entry in self:
   703         for entry in self:
   626             if refpot.find(entry.msgid) is None:
   704             if entry.msgid not in refpot_msgids:
   627                 entry.obsolete = True
   705                 entry.obsolete = True
   628 
       
   629 # }}}
   706 # }}}
   630 # class MOFile {{{
   707 # class MOFile {{{
       
   708 
   631 
   709 
   632 class MOFile(_BaseFile):
   710 class MOFile(_BaseFile):
   633     """
   711     """
   634     Mo file reader/writer.
   712     Mo file reader/writer.
   635     This class inherits the :class:`~polib._BaseFile` class and, by
   713     This class inherits the :class:`~polib._BaseFile` class and, by
   636     extension, the python ``list`` type.
   714     extension, the python ``list`` type.
   637     """
   715     """
   638     BIG_ENDIAN    = 0xde120495
   716     MAGIC = 0x950412de
   639     LITTLE_ENDIAN = 0x950412de
   717     MAGIC_SWAPPED = 0xde120495
   640 
   718 
   641     def __init__(self, *args, **kwargs):
   719     def __init__(self, *args, **kwargs):
   642         """
   720         """
   643         Constructor, accepts all keywords arguments accepted by
   721         Constructor, accepts all keywords arguments accepted by
   644         :class:`~polib._BaseFile` class.
   722         :class:`~polib._BaseFile` class.
   696     def obsolete_entries(self):
   774     def obsolete_entries(self):
   697         """
   775         """
   698         Convenience method to keep the same interface with POFile instances.
   776         Convenience method to keep the same interface with POFile instances.
   699         """
   777         """
   700         return []
   778         return []
   701 
       
   702 # }}}
   779 # }}}
   703 # class _BaseEntry {{{
   780 # class _BaseEntry {{{
       
   781 
   704 
   782 
   705 class _BaseEntry(object):
   783 class _BaseEntry(object):
   706     """
   784     """
   707     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
   785     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
   708     This class should **not** be instantiated directly.
   786     This class should **not** be instantiated directly.
   751         else:
   829         else:
   752             delflag = ''
   830             delflag = ''
   753         ret = []
   831         ret = []
   754         # write the msgctxt if any
   832         # write the msgctxt if any
   755         if self.msgctxt is not None:
   833         if self.msgctxt is not None:
   756             ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
   834             ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
       
   835                                    wrapwidth)
   757         # write the msgid
   836         # write the msgid
   758         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
   837         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
   759         # write the msgid_plural if any
   838         # write the msgid_plural if any
   760         if self.msgid_plural:
   839         if self.msgid_plural:
   761             ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth)
   840             ret += self._str_field("msgid_plural", delflag, "",
       
   841                                    self.msgid_plural, wrapwidth)
   762         if self.msgstr_plural:
   842         if self.msgstr_plural:
   763             # write the msgstr_plural if any
   843             # write the msgstr_plural if any
   764             msgstrs = self.msgstr_plural
   844             msgstrs = self.msgstr_plural
   765             keys = list(msgstrs)
   845             keys = list(msgstrs)
   766             keys.sort()
   846             keys.sort()
   767             for index in keys:
   847             for index in keys:
   768                 msgstr = msgstrs[index]
   848                 msgstr = msgstrs[index]
   769                 plural_index = '[%s]' % index
   849                 plural_index = '[%s]' % index
   770                 ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth)
   850                 ret += self._str_field("msgstr", delflag, plural_index, msgstr,
       
   851                                        wrapwidth)
   771         else:
   852         else:
   772             # otherwise write the msgstr
   853             # otherwise write the msgstr
   773             ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth)
   854             ret += self._str_field("msgstr", delflag, "", self.msgstr,
       
   855                                    wrapwidth)
   774         ret.append('')
   856         ret.append('')
   775         ret = '\n'.join(ret)
   857         ret = u('\n').join(ret)
   776 
       
   777         if type(ret) != types.UnicodeType:
       
   778             return unicode(ret, self.encoding)
       
   779         return ret
   858         return ret
   780 
   859 
   781     def __str__(self):
   860     if PY3:
   782         """
   861         def __str__(self):
   783         Returns the string representation of the entry.
   862             return self.__unicode__()
   784         """
   863     else:
   785         return unicode(self).encode(self.encoding)
   864         def __str__(self):
       
   865             """
       
   866             Returns the string representation of the entry.
       
   867             """
       
   868             return unicode(self).encode(self.encoding)
   786 
   869 
   787     def __eq__(self, other):
   870     def __eq__(self, other):
   788         return unicode(self) == unicode(other)
   871         return str(self) == str(other)
   789 
   872 
   790     def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
   873     def _str_field(self, fieldname, delflag, plural_index, field,
       
   874                    wrapwidth=78):
   791         lines = field.splitlines(True)
   875         lines = field.splitlines(True)
   792         if len(lines) > 1:
   876         if len(lines) > 1:
   793             lines = [''] + lines # start with initial empty line
   877             lines = [''] + lines  # start with initial empty line
   794         else:
   878         else:
   795             escaped_field = escape(field)
   879             escaped_field = escape(field)
   796             specialchars_count = 0
   880             specialchars_count = 0
   797             for c in ['\\', '\n', '\r', '\t', '"']:
   881             for c in ['\\', '\n', '\r', '\t', '"']:
   798                 specialchars_count += field.count(c)
   882                 specialchars_count += field.count(c)
   802             if plural_index:
   886             if plural_index:
   803                 flength += len(plural_index)
   887                 flength += len(plural_index)
   804             real_wrapwidth = wrapwidth - flength + specialchars_count
   888             real_wrapwidth = wrapwidth - flength + specialchars_count
   805             if wrapwidth > 0 and len(field) > real_wrapwidth:
   889             if wrapwidth > 0 and len(field) > real_wrapwidth:
   806                 # Wrap the line but take field name into account
   890                 # Wrap the line but take field name into account
   807                 lines = [''] + [unescape(item) for item in textwrap.wrap(
   891                 lines = [''] + [unescape(item) for item in wrap(
   808                     escaped_field,
   892                     escaped_field,
   809                     wrapwidth - 2, # 2 for quotes ""
   893                     wrapwidth - 2,  # 2 for quotes ""
   810                     drop_whitespace=False,
   894                     drop_whitespace=False,
   811                     break_long_words=False
   895                     break_long_words=False
   812                 )]
   896                 )]
   813             else:
   897             else:
   814                 lines = [field]
   898                 lines = [field]
   816             # quick and dirty trick to get the real field name
   900             # quick and dirty trick to get the real field name
   817             fieldname = fieldname[9:]
   901             fieldname = fieldname[9:]
   818 
   902 
   819         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
   903         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
   820                                 escape(lines.pop(0)))]
   904                                 escape(lines.pop(0)))]
   821         for mstr in lines:
   905         for line in lines:
   822             ret.append('%s"%s"' % (delflag, escape(mstr)))
   906             ret.append('%s"%s"' % (delflag, escape(line)))
   823         return ret
   907         return ret
   824 
       
   825 # }}}
   908 # }}}
   826 # class POEntry {{{
   909 # class POEntry {{{
   827 
   910 
       
   911 
   828 class POEntry(_BaseEntry):
   912 class POEntry(_BaseEntry):
   829     """
   913     """
   830     Represents a po file entry.
   914     Represents a po file entry.
   831     """
   915     """
   832 
   916 
   852         ``previous_msgid``
   936         ``previous_msgid``
   853             string, the entry previous msgid.
   937             string, the entry previous msgid.
   854 
   938 
   855         ``previous_msgid_plural``
   939         ``previous_msgid_plural``
   856             string, the entry previous msgid_plural.
   940             string, the entry previous msgid_plural.
       
   941 
       
   942         ``linenum``
       
   943             integer, the line number of the entry
   857         """
   944         """
   858         _BaseEntry.__init__(self, *args, **kwargs)
   945         _BaseEntry.__init__(self, *args, **kwargs)
   859         self.comment = kwargs.get('comment', '')
   946         self.comment = kwargs.get('comment', '')
   860         self.tcomment = kwargs.get('tcomment', '')
   947         self.tcomment = kwargs.get('tcomment', '')
   861         self.occurrences = kwargs.get('occurrences', [])
   948         self.occurrences = kwargs.get('occurrences', [])
   862         self.flags = kwargs.get('flags', [])
   949         self.flags = kwargs.get('flags', [])
   863         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
   950         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
   864         self.previous_msgid = kwargs.get('previous_msgid', None)
   951         self.previous_msgid = kwargs.get('previous_msgid', None)
   865         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
   952         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
       
   953         self.linenum = kwargs.get('linenum', None)
   866 
   954 
   867     def __unicode__(self, wrapwidth=78):
   955     def __unicode__(self, wrapwidth=78):
   868         """
   956         """
   869         Returns the unicode representation of the entry.
   957         Returns the unicode representation of the entry.
   870         """
   958         """
   877         for c in comments:
   965         for c in comments:
   878             val = getattr(self, c[0])
   966             val = getattr(self, c[0])
   879             if val:
   967             if val:
   880                 for comment in val.split('\n'):
   968                 for comment in val.split('\n'):
   881                     if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
   969                     if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
   882                         ret += textwrap.wrap(
   970                         ret += wrap(
   883                             comment,
   971                             comment,
   884                             wrapwidth,
   972                             wrapwidth,
   885                             initial_indent=c[1],
   973                             initial_indent=c[1],
   886                             subsequent_indent=c[1],
   974                             subsequent_indent=c[1],
   887                             break_long_words=False
   975                             break_long_words=False
   901             if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
   989             if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
   902                 # textwrap splits words that contain hyphens; this is not
   990                 # textwrap splits words that contain hyphens; this is not
   903                 # what we want for filenames, so the dirty hack is to
   991                 # what we want for filenames, so the dirty hack is to
   904                 # temporarily replace hyphens with a char that a file cannot
   992                 # temporarily replace hyphens with a char that a file cannot
   905                 # contain, like "*"
   993                 # contain, like "*"
   906                 ret += [l.replace('*', '-') for l in textwrap.wrap(
   994                 ret += [l.replace('*', '-') for l in wrap(
   907                     filestr.replace('-', '*'),
   995                     filestr.replace('-', '*'),
   908                     wrapwidth,
   996                     wrapwidth,
   909                     initial_indent='#: ',
   997                     initial_indent='#: ',
   910                     subsequent_indent='#: ',
   998                     subsequent_indent='#: ',
   911                     break_long_words=False
   999                     break_long_words=False
   916         # flags (TODO: wrapping ?)
  1004         # flags (TODO: wrapping ?)
   917         if self.flags:
  1005         if self.flags:
   918             ret.append('#, %s' % ', '.join(self.flags))
  1006             ret.append('#, %s' % ', '.join(self.flags))
   919 
  1007 
   920         # previous context and previous msgid/msgid_plural
  1008         # previous context and previous msgid/msgid_plural
   921         fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
  1009         fields = ['previous_msgctxt', 'previous_msgid',
       
  1010                   'previous_msgid_plural']
   922         for f in fields:
  1011         for f in fields:
   923             val = getattr(self, f)
  1012             val = getattr(self, f)
   924             if val:
  1013             if val:
   925                 ret += self._str_field(f, "#| ", "", val, wrapwidth)
  1014                 ret += self._str_field(f, "#| ", "", val, wrapwidth)
   926 
  1015 
   927         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
  1016         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
   928         ret = '\n'.join(ret)
  1017         ret = u('\n').join(ret)
   929 
  1018 
   930         if type(ret) != types.UnicodeType:
  1019         assert isinstance(ret, text_type)
   931             return unicode(ret, self.encoding)
  1020         #if type(ret) != types.UnicodeType:
       
  1021         #    return unicode(ret, self.encoding)
   932         return ret
  1022         return ret
   933 
  1023 
   934     def __cmp__(self, other):
  1024     def __cmp__(self, other):
   935         """
  1025         """
   936         Called by comparison operations if rich comparison is not defined.
  1026         Called by comparison operations if rich comparison is not defined.
   937         """
  1027         """
   938         def compare_occurrences(a, b):
       
   939             """
       
   940             Compare an entry occurrence with another one.
       
   941             """
       
   942             if a[0] != b[0]:
       
   943                 return a[0] < b[0]
       
   944             if a[1] != b[1]:
       
   945                 return a[1] < b[1]
       
   946             return 0
       
   947 
  1028 
   948         # First: Obsolete test
  1029         # First: Obsolete test
   949         if self.obsolete != other.obsolete:
  1030         if self.obsolete != other.obsolete:
   950             if self.obsolete:
  1031             if self.obsolete:
   951                 return -1
  1032                 return -1
   952             else:
  1033             else:
   953                 return 1
  1034                 return 1
   954         # Work on a copy to protect original
  1035         # Work on a copy to protect original
   955         occ1 = self.occurrences[:]
  1036         occ1 = sorted(self.occurrences[:])
   956         occ2 = other.occurrences[:]
  1037         occ2 = sorted(other.occurrences[:])
   957         # Sorting using compare method
       
   958         occ1.sort(compare_occurrences)
       
   959         occ2.sort(compare_occurrences)
       
   960         # Comparing sorted occurrences
       
   961         pos = 0
  1038         pos = 0
   962         for entry1 in occ1:
  1039         for entry1 in occ1:
   963             try:
  1040             try:
   964                 entry2 = occ2[pos]
  1041                 entry2 = occ2[pos]
   965             except IndexError:
  1042             except IndexError:
   973             if entry1[1] != entry2[1]:
  1050             if entry1[1] != entry2[1]:
   974                 if entry1[1] > entry2[1]:
  1051                 if entry1[1] > entry2[1]:
   975                     return 1
  1052                     return 1
   976                 else:
  1053                 else:
   977                     return -1
  1054                     return -1
       
  1055         # Compare msgid_plural if set
       
  1056         if self.msgid_plural:
       
  1057             if not other.msgid_plural:
       
  1058                 return 1
       
  1059             for pos in self.msgid_plural:
       
  1060                 if pos not in other.msgid_plural:
       
  1061                     return 1
       
  1062                 if self.msgid_plural[pos] > other.msgid_plural[pos]:
       
  1063                     return 1
       
  1064                 if self.msgid_plural[pos] < other.msgid_plural[pos]:
       
  1065                     return -1
   978         # Finally: Compare message ID
  1066         # Finally: Compare message ID
   979         if self.msgid > other.msgid: return 1
  1067         if self.msgid > other.msgid:
   980         else: return -1
  1068             return 1
       
  1069         elif self.msgid < other.msgid:
       
  1070             return -1
       
  1071         return 0
       
  1072 
       
  1073     def __gt__(self, other):
       
  1074         return self.__cmp__(other) > 0
       
  1075 
       
  1076     def __lt__(self, other):
       
  1077         return self.__cmp__(other) < 0
       
  1078 
       
  1079     def __ge__(self, other):
       
  1080         return self.__cmp__(other) >= 0
       
  1081 
       
  1082     def __le__(self, other):
       
  1083         return self.__cmp__(other) <= 0
       
  1084 
       
  1085     def __eq__(self, other):
       
  1086         return self.__cmp__(other) == 0
       
  1087 
       
  1088     def __ne__(self, other):
       
  1089         return self.__cmp__(other) != 0
   981 
  1090 
   982     def translated(self):
  1091     def translated(self):
   983         """
  1092         """
   984         Returns ``True`` if the entry has been translated or ``False``
  1093         Returns ``True`` if the entry has been translated or ``False``
   985         otherwise.
  1094         otherwise.
  1018                     # keep existing translation at pos if any
  1127                     # keep existing translation at pos if any
  1019                     self.msgstr_plural[pos]
  1128                     self.msgstr_plural[pos]
  1020                 except KeyError:
  1129                 except KeyError:
  1021                     self.msgstr_plural[pos] = ''
  1130                     self.msgstr_plural[pos] = ''
  1022 
  1131 
       
  1132     def __hash__(self):
       
  1133         return hash((self.msgid, self.msgstr))
  1023 # }}}
  1134 # }}}
  1024 # class MOEntry {{{
  1135 # class MOEntry {{{
  1025 
  1136 
       
  1137 
  1026 class MOEntry(_BaseEntry):
  1138 class MOEntry(_BaseEntry):
  1027     """
  1139     """
  1028     Represents a mo file entry.
  1140     Represents a mo file entry.
  1029     """
  1141     """
  1030     pass
  1142     def __init__(self, *args, **kwargs):
       
  1143         """
       
  1144         Constructor, accepts the following keyword arguments,
       
  1145         for consistency with :class:`~polib.POEntry`:
       
  1146 
       
  1147         ``comment``
       
  1148         ``tcomment``
       
  1149         ``occurrences``
       
  1150         ``flags``
       
  1151         ``previous_msgctxt``
       
  1152         ``previous_msgid``
       
  1153         ``previous_msgid_plural``
       
  1154 
       
  1155         Note: even though these keyword arguments are accepted,
       
  1156         they hold no real meaning in the context of MO files
       
  1157         and are simply ignored.
       
  1158         """
       
  1159         _BaseEntry.__init__(self, *args, **kwargs)
       
  1160         self.comment = ''
       
  1161         self.tcomment = ''
       
  1162         self.occurrences = []
       
  1163         self.flags = []
       
  1164         self.previous_msgctxt = None
       
  1165         self.previous_msgid = None
       
  1166         self.previous_msgid_plural = None
       
  1167 
       
  1168     def __hash__(self):
       
  1169         return hash((self.msgid, self.msgstr))
  1031 
  1170 
  1032 # }}}
  1171 # }}}
  1033 # class _POFileParser {{{
  1172 # class _POFileParser {{{
       
  1173 
  1034 
  1174 
  1035 class _POFileParser(object):
  1175 class _POFileParser(object):
  1036     """
  1176     """
  1037     A finite state machine to efficiently and correctly parse the po
  1177     A finite state machine to efficiently and correctly parse the po
  1038     file format.
  1178     file format.
  1054         ``check_for_duplicates``
  1194         ``check_for_duplicates``
  1055             whether to check for duplicate entries when adding entries to the
  1195             whether to check for duplicate entries when adding entries to the
  1056             file (optional, default: ``False``).
  1196             file (optional, default: ``False``).
  1057         """
  1197         """
  1058         enc = kwargs.get('encoding', default_encoding)
  1198         enc = kwargs.get('encoding', default_encoding)
  1059         if os.path.exists(pofile):
  1199         if _is_file(pofile):
  1060             try:
  1200             try:
  1061                 self.fhandle = codecs.open(pofile, 'rU', enc)
  1201                 self.fhandle = io.open(pofile, 'rt', encoding=enc)
  1062             except LookupError:
  1202             except LookupError:
  1063                 enc = default_encoding
  1203                 enc = default_encoding
  1064                 self.fhandle = codecs.open(pofile, 'rU', enc)
  1204                 self.fhandle = io.open(pofile, 'rt', encoding=enc)
  1065         else:
  1205         else:
  1066             self.fhandle = pofile.splitlines()
  1206             self.fhandle = pofile.splitlines()
  1067 
  1207 
  1068         self.instance = POFile(
  1208         klass = kwargs.get('klass')
       
  1209         if klass is None:
       
  1210             klass = POFile
       
  1211         self.instance = klass(
  1069             pofile=pofile,
  1212             pofile=pofile,
  1070             encoding=enc,
  1213             encoding=enc,
  1071             check_for_duplicates=kwargs.get('check_for_duplicates', False)
  1214             check_for_duplicates=kwargs.get('check_for_duplicates', False)
  1072         )
  1215         )
  1073         self.transitions = {}
  1216         self.transitions = {}
  1074         self.current_entry = POEntry()
  1217         self.current_line = 0
  1075         self.current_state = 'ST'
  1218         self.current_entry = POEntry(linenum=self.current_line)
       
  1219         self.current_state = 'st'
  1076         self.current_token = None
  1220         self.current_token = None
  1077         # two memo flags used in handlers
  1221         # two memo flags used in handlers
  1078         self.msgstr_index = 0
  1222         self.msgstr_index = 0
  1079         self.entry_obsolete = 0
  1223         self.entry_obsolete = 0
  1080         # Configure the state machine, by adding transitions.
  1224         # Configure the state machine, by adding transitions.
  1081         # Meaning of symbols:
  1225         # Meaning of symbols:
  1082         #     * ST: Beginning of the file (start)
  1226         #     * ST: Beginning of the file (start)
  1083         #     * HE: Header
  1227         #     * HE: Header
  1084         #     * TC: a translation comment
  1228         #     * TC: a translation comment
  1085         #     * GC: a generated comment
  1229         #     * GC: a generated comment
  1086         #     * OC: a file/line occurence
  1230         #     * OC: a file/line occurrence
  1087         #     * FL: a flags line
  1231         #     * FL: a flags line
  1088         #     * CT: a message context
  1232         #     * CT: a message context
  1089         #     * PC: a previous msgctxt
  1233         #     * PC: a previous msgctxt
  1090         #     * PM: a previous msgid
  1234         #     * PM: a previous msgid
  1091         #     * PP: a previous msgid_plural
  1235         #     * PP: a previous msgid_plural
  1092         #     * MI: a msgid
  1236         #     * MI: a msgid
  1093         #     * MP: a msgid plural
  1237         #     * MP: a msgid plural
  1094         #     * MS: a msgstr
  1238         #     * MS: a msgstr
  1095         #     * MX: a msgstr plural
  1239         #     * MX: a msgstr plural
  1096         #     * MC: a msgid or msgstr continuation line
  1240         #     * MC: a msgid or msgstr continuation line
  1097         all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC',
  1241         all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
  1098                'MS', 'MP', 'MX', 'MI']
  1242                'ms', 'mp', 'mx', 'mi']
  1099 
  1243 
  1100         self.add('TC', ['ST', 'HE'],                                     'HE')
  1244         self.add('tc', ['st', 'he'],                                     'he')
  1101         self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS',
  1245         self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
  1102                         'MP', 'MX', 'MI'],                               'TC')
  1246                         'mp', 'mx', 'mi'],                               'tc')
  1103         self.add('GC', all,                                              'GC')
  1247         self.add('gc', all,                                              'gc')
  1104         self.add('OC', all,                                              'OC')
  1248         self.add('oc', all,                                              'oc')
  1105         self.add('FL', all,                                              'FL')
  1249         self.add('fl', all,                                              'fl')
  1106         self.add('PC', all,                                              'PC')
  1250         self.add('pc', all,                                              'pc')
  1107         self.add('PM', all,                                              'PM')
  1251         self.add('pm', all,                                              'pm')
  1108         self.add('PP', all,                                              'PP')
  1252         self.add('pp', all,                                              'pp')
  1109         self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM',
  1253         self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
  1110                         'PP', 'MS', 'MX'],                               'CT')
  1254                         'pp', 'ms', 'mx'],                               'ct')
  1111         self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC',
  1255         self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
  1112                  'PM', 'PP', 'MS', 'MX'],                                'MI')
  1256                  'pm', 'pp', 'ms', 'mx'],                                'mi')
  1113         self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'],             'MP')
  1257         self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
  1114         self.add('MS', ['MI', 'MP', 'TC'],                               'MS')
  1258         self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
  1115         self.add('MX', ['MI', 'MX', 'MP', 'TC'],                         'MX')
  1259         self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
  1116         self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC')
  1260         self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
  1117 
  1261 
  1118     def parse(self):
  1262     def parse(self):
  1119         """
  1263         """
  1120         Run the state machine, parse the file line by line and call process()
  1264         Run the state machine, parse the file line by line and call process()
  1121         with the current matched symbol.
  1265         with the current matched symbol.
  1122         """
  1266         """
  1123         i = 0
       
  1124 
  1267 
  1125         keywords = {
  1268         keywords = {
  1126             'msgctxt': 'CT',
  1269             'msgctxt': 'ct',
  1127             'msgid': 'MI',
  1270             'msgid': 'mi',
  1128             'msgstr': 'MS',
  1271             'msgstr': 'ms',
  1129             'msgid_plural': 'MP',
  1272             'msgid_plural': 'mp',
  1130         }
  1273         }
  1131         prev_keywords = {
  1274         prev_keywords = {
  1132             'msgid_plural': 'PP',
  1275             'msgid_plural': 'pp',
  1133             'msgid': 'PM',
  1276             'msgid': 'pm',
  1134             'msgctxt': 'PC',
  1277             'msgctxt': 'pc',
  1135         }
  1278         }
  1136 
  1279         tokens = []
  1137         for line in self.fhandle:
  1280         for line in self.fhandle:
  1138             i += 1
  1281             self.current_line += 1
  1139             line = line.strip()
  1282             line = line.strip()
  1140             if line == '':
  1283             if line == '':
  1141                 continue
  1284                 continue
  1142 
  1285 
  1143             tokens = line.split(None, 2)
  1286             tokens = line.split(None, 2)
  1144             nb_tokens = len(tokens)
  1287             nb_tokens = len(tokens)
       
  1288 
       
  1289             if tokens[0] == '#~|':
       
  1290                 continue
  1145 
  1291 
  1146             if tokens[0] == '#~' and nb_tokens > 1:
  1292             if tokens[0] == '#~' and nb_tokens > 1:
  1147                 line = line[3:].strip()
  1293                 line = line[3:].strip()
  1148                 tokens = tokens[1:]
  1294                 tokens = tokens[1:]
  1149                 nb_tokens -= 1
  1295                 nb_tokens -= 1
  1153 
  1299 
  1154             # Take care of keywords like
  1300             # Take care of keywords like
  1155             # msgid, msgid_plural, msgctxt & msgstr.
  1301             # msgid, msgid_plural, msgctxt & msgstr.
  1156             if tokens[0] in keywords and nb_tokens > 1:
  1302             if tokens[0] in keywords and nb_tokens > 1:
  1157                 line = line[len(tokens[0]):].lstrip()
  1303                 line = line[len(tokens[0]):].lstrip()
       
  1304                 if re.search(r'([^\\]|^)"', line[1:-1]):
       
  1305                     raise IOError('Syntax error in po file %s (line %s): '
       
  1306                                   'unescaped double quote found' %
       
  1307                                   (self.instance.fpath, self.current_line))
  1158                 self.current_token = line
  1308                 self.current_token = line
  1159                 self.process(keywords[tokens[0]], i)
  1309                 self.process(keywords[tokens[0]])
  1160                 continue
  1310                 continue
  1161 
  1311 
  1162             self.current_token = line
  1312             self.current_token = line
  1163 
  1313 
  1164             if tokens[0] == '#:' and nb_tokens > 1:
  1314             if tokens[0] == '#:':
       
  1315                 if nb_tokens <= 1:
       
  1316                     continue
  1165                 # we are on a occurrences line
  1317                 # we are on a occurrences line
  1166                 self.process('OC', i)
  1318                 self.process('oc')
  1167 
  1319 
  1168             elif line[:1] == '"':
  1320             elif line[:1] == '"':
  1169                 # we are on a continuation line
  1321                 # we are on a continuation line
  1170                 self.process('MC', i)
  1322                 if re.search(r'([^\\]|^)"', line[1:-1]):
       
  1323                     raise IOError('Syntax error in po file %s (line %s): '
       
  1324                                   'unescaped double quote found' %
       
  1325                                   (self.instance.fpath, self.current_line))
       
  1326                 self.process('mc')
  1171 
  1327 
  1172             elif line[:7] == 'msgstr[':
  1328             elif line[:7] == 'msgstr[':
  1173                 # we are on a msgstr plural
  1329                 # we are on a msgstr plural
  1174                 self.process('MX', i)
  1330                 self.process('mx')
  1175 
  1331 
  1176             elif tokens[0] == '#,' and nb_tokens > 1:
  1332             elif tokens[0] == '#,':
       
  1333                 if nb_tokens <= 1:
       
  1334                     continue
  1177                 # we are on a flags line
  1335                 # we are on a flags line
  1178                 self.process('FL', i)
  1336                 self.process('fl')
  1179 
  1337 
  1180             elif tokens[0] == '#':
  1338             elif tokens[0] == '#' or tokens[0].startswith('##'):
  1181                 if line == '#': line += ' '
  1339                 if line == '#':
       
  1340                     line += ' '
  1182                 # we are on a translator comment line
  1341                 # we are on a translator comment line
  1183                 self.process('TC', i)
  1342                 self.process('tc')
  1184 
  1343 
  1185             elif tokens[0] == '#.' and nb_tokens > 1:
  1344             elif tokens[0] == '#.':
       
  1345                 if nb_tokens <= 1:
       
  1346                     continue
  1186                 # we are on a generated comment line
  1347                 # we are on a generated comment line
  1187                 self.process('GC', i)
  1348                 self.process('gc')
  1188 
  1349 
  1189             elif tokens[0] == '#|':
  1350             elif tokens[0] == '#|':
  1190                 if nb_tokens < 2:
  1351                 if nb_tokens <= 1:
  1191                     self.process('??', i)
  1352                     raise IOError('Syntax error in po file %s (line %s)' %
  1192                     continue
  1353                                   (self.instance.fpath, self.current_line))
  1193 
  1354 
  1194                 # Remove the marker and any whitespace right after that.
  1355                 # Remove the marker and any whitespace right after that.
  1195                 line = line[2:].lstrip()
  1356                 line = line[2:].lstrip()
  1196                 self.current_token = line
  1357                 self.current_token = line
  1197 
  1358 
  1198                 if tokens[1].startswith('"'):
  1359                 if tokens[1].startswith('"'):
  1199                     # Continuation of previous metadata.
  1360                     # Continuation of previous metadata.
  1200                     self.process('MC', i)
  1361                     self.process('mc')
  1201                     continue
  1362                     continue
  1202 
  1363 
  1203                 if nb_tokens == 2:
  1364                 if nb_tokens == 2:
  1204                     # Invalid continuation line.
  1365                     # Invalid continuation line.
  1205                     self.process('??', i)
  1366                     raise IOError('Syntax error in po file %s (line %s): '
       
  1367                                   'invalid continuation line' %
       
  1368                                   (self.instance.fpath, self.current_line))
  1206 
  1369 
  1207                 # we are on a "previous translation" comment line,
  1370                 # we are on a "previous translation" comment line,
  1208                 if tokens[1] not in prev_keywords:
  1371                 if tokens[1] not in prev_keywords:
  1209                     # Unknown keyword in previous translation comment.
  1372                     # Unknown keyword in previous translation comment.
  1210                     self.process('??', i)
  1373                     raise IOError('Syntax error in po file %s (line %s): '
       
  1374                                   'unknown keyword %s' %
       
  1375                                   (self.instance.fpath, self.current_line,
       
  1376                                    tokens[1]))
  1211 
  1377 
  1212                 # Remove the keyword and any whitespace
  1378                 # Remove the keyword and any whitespace
  1213                 # between it and the starting quote.
  1379                 # between it and the starting quote.
  1214                 line = line[len(tokens[1]):].lstrip()
  1380                 line = line[len(tokens[1]):].lstrip()
  1215                 self.current_token = line
  1381                 self.current_token = line
  1216                 self.process(prev_keywords[tokens[1]], i)
  1382                 self.process(prev_keywords[tokens[1]])
  1217 
  1383 
  1218             else:
  1384             else:
  1219                 self.process('??', i)
  1385                 raise IOError('Syntax error in po file %s (line %s)' %
  1220 
  1386                               (self.instance.fpath, self.current_line))
  1221         if self.current_entry:
  1387 
       
  1388         if self.current_entry and len(tokens) > 0 and \
       
  1389            not tokens[0].startswith('#'):
  1222             # since entries are added when another entry is found, we must add
  1390             # since entries are added when another entry is found, we must add
  1223             # the last entry here (only if there are lines)
  1391             # the last entry here (only if there are lines). Trailing comments
       
  1392             # are ignored
  1224             self.instance.append(self.current_entry)
  1393             self.instance.append(self.current_entry)
       
  1394 
  1225         # before returning the instance, check if there's metadata and if
  1395         # before returning the instance, check if there's metadata and if
  1226         # so extract it in a dict
  1396         # so extract it in a dict
  1227         firstentry = self.instance[0]
  1397         metadataentry = self.instance.find('')
  1228         if firstentry.msgid == '': # metadata found
  1398         if metadataentry:  # metadata found
  1229             # remove the entry
  1399             # remove the entry
  1230             firstentry = self.instance.pop(0)
  1400             self.instance.remove(metadataentry)
  1231             self.instance.metadata_is_fuzzy = firstentry.flags
  1401             self.instance.metadata_is_fuzzy = metadataentry.flags
  1232             key = None
  1402             key = None
  1233             for msg in firstentry.msgstr.splitlines():
  1403             for msg in metadataentry.msgstr.splitlines():
  1234                 try:
  1404                 try:
  1235                     key, val = msg.split(':', 1)
  1405                     key, val = msg.split(':', 1)
  1236                     self.instance.metadata[key] = val.strip()
  1406                     self.instance.metadata[key] = val.strip()
  1237                 except:
  1407                 except (ValueError, KeyError):
  1238                     if key is not None:
  1408                     if key is not None:
  1239                         self.instance.metadata[key] += '\n'+ msg.strip()
  1409                         self.instance.metadata[key] += '\n' + msg.strip()
  1240         # close opened file
  1410         # close opened file
  1241         if isinstance(self.fhandle, file):
  1411         if not isinstance(self.fhandle, list):  # must be file
  1242             self.fhandle.close()
  1412             self.fhandle.close()
  1243         return self.instance
  1413         return self.instance
  1244 
  1414 
  1245     def add(self, symbol, states, next_state):
  1415     def add(self, symbol, states, next_state):
  1246         """
  1416         """
  1256 
  1426 
  1257         ``next_state``
  1427         ``next_state``
  1258             the next state the fsm will have after the action.
  1428             the next state the fsm will have after the action.
  1259         """
  1429         """
  1260         for state in states:
  1430         for state in states:
  1261             action = getattr(self, 'handle_%s' % next_state.lower())
  1431             action = getattr(self, 'handle_%s' % next_state)
  1262             self.transitions[(symbol, state)] = (action, next_state)
  1432             self.transitions[(symbol, state)] = (action, next_state)
  1263 
  1433 
  1264     def process(self, symbol, linenum):
  1434     def process(self, symbol):
  1265         """
  1435         """
  1266         Process the transition corresponding to the current state and the
  1436         Process the transition corresponding to the current state and the
  1267         symbol provided.
  1437         symbol provided.
  1268 
  1438 
  1269         Keywords arguments:
  1439         Keywords arguments:
  1276         """
  1446         """
  1277         try:
  1447         try:
  1278             (action, state) = self.transitions[(symbol, self.current_state)]
  1448             (action, state) = self.transitions[(symbol, self.current_state)]
  1279             if action():
  1449             if action():
  1280                 self.current_state = state
  1450                 self.current_state = state
  1281         except Exception as exc:
  1451         except Exception:
  1282             raise IOError('Syntax error in po file (line %s)' % linenum)
  1452             raise IOError('Syntax error in po file (line %s)' %
       
  1453                           self.current_line)
  1283 
  1454 
  1284     # state handlers
  1455     # state handlers
  1285 
  1456 
  1286     def handle_he(self):
  1457     def handle_he(self):
  1287         """Handle a header comment."""
  1458         """Handle a header comment."""
  1290         self.instance.header += self.current_token[2:]
  1461         self.instance.header += self.current_token[2:]
  1291         return 1
  1462         return 1
  1292 
  1463 
  1293     def handle_tc(self):
  1464     def handle_tc(self):
  1294         """Handle a translator comment."""
  1465         """Handle a translator comment."""
  1295         if self.current_state in ['MC', 'MS', 'MX']:
  1466         if self.current_state in ['mc', 'ms', 'mx']:
  1296             self.instance.append(self.current_entry)
  1467             self.instance.append(self.current_entry)
  1297             self.current_entry = POEntry()
  1468             self.current_entry = POEntry(linenum=self.current_line)
  1298         if self.current_entry.tcomment != '':
  1469         if self.current_entry.tcomment != '':
  1299             self.current_entry.tcomment += '\n'
  1470             self.current_entry.tcomment += '\n'
  1300         self.current_entry.tcomment += self.current_token[2:]
  1471         tcomment = self.current_token.lstrip('#')
       
  1472         if tcomment.startswith(' '):
       
  1473             tcomment = tcomment[1:]
       
  1474         self.current_entry.tcomment += tcomment
  1301         return True
  1475         return True
  1302 
  1476 
  1303     def handle_gc(self):
  1477     def handle_gc(self):
  1304         """Handle a generated comment."""
  1478         """Handle a generated comment."""
  1305         if self.current_state in ['MC', 'MS', 'MX']:
  1479         if self.current_state in ['mc', 'ms', 'mx']:
  1306             self.instance.append(self.current_entry)
  1480             self.instance.append(self.current_entry)
  1307             self.current_entry = POEntry()
  1481             self.current_entry = POEntry(linenum=self.current_line)
  1308         if self.current_entry.comment != '':
  1482         if self.current_entry.comment != '':
  1309             self.current_entry.comment += '\n'
  1483             self.current_entry.comment += '\n'
  1310         self.current_entry.comment += self.current_token[3:]
  1484         self.current_entry.comment += self.current_token[3:]
  1311         return True
  1485         return True
  1312 
  1486 
  1313     def handle_oc(self):
  1487     def handle_oc(self):
  1314         """Handle a file:num occurence."""
  1488         """Handle a file:num occurrence."""
  1315         if self.current_state in ['MC', 'MS', 'MX']:
  1489         if self.current_state in ['mc', 'ms', 'mx']:
  1316             self.instance.append(self.current_entry)
  1490             self.instance.append(self.current_entry)
  1317             self.current_entry = POEntry()
  1491             self.current_entry = POEntry(linenum=self.current_line)
  1318         occurrences = self.current_token[3:].split()
  1492         occurrences = self.current_token[3:].split()
  1319         for occurrence in occurrences:
  1493         for occurrence in occurrences:
  1320             if occurrence != '':
  1494             if occurrence != '':
  1321                 try:
  1495                 try:
  1322                     fil, line = occurrence.split(':')
  1496                     fil, line = occurrence.split(':')
  1323                     if not line.isdigit():
  1497                     if not line.isdigit():
  1324                         fil  = fil + line
  1498                         fil = fil + line
  1325                         line = ''
  1499                         line = ''
  1326                     self.current_entry.occurrences.append((fil, line))
  1500                     self.current_entry.occurrences.append((fil, line))
  1327                 except:
  1501                 except (ValueError, AttributeError):
  1328                     self.current_entry.occurrences.append((occurrence, ''))
  1502                     self.current_entry.occurrences.append((occurrence, ''))
  1329         return True
  1503         return True
  1330 
  1504 
  1331     def handle_fl(self):
  1505     def handle_fl(self):
  1332         """Handle a flags line."""
  1506         """Handle a flags line."""
  1333         if self.current_state in ['MC', 'MS', 'MX']:
  1507         if self.current_state in ['mc', 'ms', 'mx']:
  1334             self.instance.append(self.current_entry)
  1508             self.instance.append(self.current_entry)
  1335             self.current_entry = POEntry()
  1509             self.current_entry = POEntry(linenum=self.current_line)
  1336         self.current_entry.flags += self.current_token[3:].split(', ')
  1510         self.current_entry.flags += [c.strip() for c in
       
  1511                                      self.current_token[3:].split(',')]
  1337         return True
  1512         return True
  1338 
  1513 
  1339     def handle_pp(self):
  1514     def handle_pp(self):
  1340         """Handle a previous msgid_plural line."""
  1515         """Handle a previous msgid_plural line."""
  1341         if self.current_state in ['MC', 'MS', 'MX']:
  1516         if self.current_state in ['mc', 'ms', 'mx']:
  1342             self.instance.append(self.current_entry)
  1517             self.instance.append(self.current_entry)
  1343             self.current_entry = POEntry()
  1518             self.current_entry = POEntry(linenum=self.current_line)
  1344         self.current_entry.previous_msgid_plural = \
  1519         self.current_entry.previous_msgid_plural = \
  1345             unescape(self.current_token[1:-1])
  1520             unescape(self.current_token[1:-1])
  1346         return True
  1521         return True
  1347 
  1522 
  1348     def handle_pm(self):
  1523     def handle_pm(self):
  1349         """Handle a previous msgid line."""
  1524         """Handle a previous msgid line."""
  1350         if self.current_state in ['MC', 'MS', 'MX']:
  1525         if self.current_state in ['mc', 'ms', 'mx']:
  1351             self.instance.append(self.current_entry)
  1526             self.instance.append(self.current_entry)
  1352             self.current_entry = POEntry()
  1527             self.current_entry = POEntry(linenum=self.current_line)
  1353         self.current_entry.previous_msgid = \
  1528         self.current_entry.previous_msgid = \
  1354             unescape(self.current_token[1:-1])
  1529             unescape(self.current_token[1:-1])
  1355         return True
  1530         return True
  1356 
  1531 
  1357     def handle_pc(self):
  1532     def handle_pc(self):
  1358         """Handle a previous msgctxt line."""
  1533         """Handle a previous msgctxt line."""
  1359         if self.current_state in ['MC', 'MS', 'MX']:
  1534         if self.current_state in ['mc', 'ms', 'mx']:
  1360             self.instance.append(self.current_entry)
  1535             self.instance.append(self.current_entry)
  1361             self.current_entry = POEntry()
  1536             self.current_entry = POEntry(linenum=self.current_line)
  1362         self.current_entry.previous_msgctxt = \
  1537         self.current_entry.previous_msgctxt = \
  1363             unescape(self.current_token[1:-1])
  1538             unescape(self.current_token[1:-1])
  1364         return True
  1539         return True
  1365 
  1540 
  1366     def handle_ct(self):
  1541     def handle_ct(self):
  1367         """Handle a msgctxt."""
  1542         """Handle a msgctxt."""
  1368         if self.current_state in ['MC', 'MS', 'MX']:
  1543         if self.current_state in ['mc', 'ms', 'mx']:
  1369             self.instance.append(self.current_entry)
  1544             self.instance.append(self.current_entry)
  1370             self.current_entry = POEntry()
  1545             self.current_entry = POEntry(linenum=self.current_line)
  1371         self.current_entry.msgctxt = unescape(self.current_token[1:-1])
  1546         self.current_entry.msgctxt = unescape(self.current_token[1:-1])
  1372         return True
  1547         return True
  1373 
  1548 
  1374     def handle_mi(self):
  1549     def handle_mi(self):
  1375         """Handle a msgid."""
  1550         """Handle a msgid."""
  1376         if self.current_state in ['MC', 'MS', 'MX']:
  1551         if self.current_state in ['mc', 'ms', 'mx']:
  1377             self.instance.append(self.current_entry)
  1552             self.instance.append(self.current_entry)
  1378             self.current_entry = POEntry()
  1553             self.current_entry = POEntry(linenum=self.current_line)
  1379         self.current_entry.obsolete = self.entry_obsolete
  1554         self.current_entry.obsolete = self.entry_obsolete
  1380         self.current_entry.msgid = unescape(self.current_token[1:-1])
  1555         self.current_entry.msgid = unescape(self.current_token[1:-1])
  1381         return True
  1556         return True
  1382 
  1557 
  1383     def handle_mp(self):
  1558     def handle_mp(self):
  1390         self.current_entry.msgstr = unescape(self.current_token[1:-1])
  1565         self.current_entry.msgstr = unescape(self.current_token[1:-1])
  1391         return True
  1566         return True
  1392 
  1567 
  1393     def handle_mx(self):
  1568     def handle_mx(self):
  1394         """Handle a msgstr plural."""
  1569         """Handle a msgstr plural."""
  1395         index, value = self.current_token[7], self.current_token[11:-1]
  1570         index = self.current_token[7]
  1396         self.current_entry.msgstr_plural[index] = unescape(value)
  1571         value = self.current_token[self.current_token.find('"') + 1:-1]
  1397         self.msgstr_index = index
  1572         self.current_entry.msgstr_plural[int(index)] = unescape(value)
       
  1573         self.msgstr_index = int(index)
  1398         return True
  1574         return True
  1399 
  1575 
  1400     def handle_mc(self):
  1576     def handle_mc(self):
  1401         """Handle a msgid or msgstr continuation line."""
  1577         """Handle a msgid or msgstr continuation line."""
  1402         token = unescape(self.current_token[1:-1])
  1578         token = unescape(self.current_token[1:-1])
  1403         if self.current_state == 'CT':
  1579         if self.current_state == 'ct':
  1404             typ = 'msgctxt'
       
  1405             self.current_entry.msgctxt += token
  1580             self.current_entry.msgctxt += token
  1406         elif self.current_state == 'MI':
  1581         elif self.current_state == 'mi':
  1407             typ = 'msgid'
       
  1408             self.current_entry.msgid += token
  1582             self.current_entry.msgid += token
  1409         elif self.current_state == 'MP':
  1583         elif self.current_state == 'mp':
  1410             typ = 'msgid_plural'
       
  1411             self.current_entry.msgid_plural += token
  1584             self.current_entry.msgid_plural += token
  1412         elif self.current_state == 'MS':
  1585         elif self.current_state == 'ms':
  1413             typ = 'msgstr'
       
  1414             self.current_entry.msgstr += token
  1586             self.current_entry.msgstr += token
  1415         elif self.current_state == 'MX':
  1587         elif self.current_state == 'mx':
  1416             typ = 'msgstr[%s]' % self.msgstr_index
       
  1417             self.current_entry.msgstr_plural[self.msgstr_index] += token
  1588             self.current_entry.msgstr_plural[self.msgstr_index] += token
  1418         elif self.current_state == 'PP':
  1589         elif self.current_state == 'pp':
  1419             typ = 'previous_msgid_plural'
       
  1420             token = token[3:]
       
  1421             self.current_entry.previous_msgid_plural += token
  1590             self.current_entry.previous_msgid_plural += token
  1422         elif self.current_state == 'PM':
  1591         elif self.current_state == 'pm':
  1423             typ = 'previous_msgid'
       
  1424             token = token[3:]
       
  1425             self.current_entry.previous_msgid += token
  1592             self.current_entry.previous_msgid += token
  1426         elif self.current_state == 'PC':
  1593         elif self.current_state == 'pc':
  1427             typ = 'previous_msgctxt'
       
  1428             token = token[3:]
       
  1429             self.current_entry.previous_msgctxt += token
  1594             self.current_entry.previous_msgctxt += token
  1430         # don't change the current state
  1595         # don't change the current state
  1431         return False
  1596         return False
  1432 
       
  1433 # }}}
  1597 # }}}
  1434 # class _MOFileParser {{{
  1598 # class _MOFileParser {{{
       
  1599 
  1435 
  1600 
  1436 class _MOFileParser(object):
  1601 class _MOFileParser(object):
  1437     """
  1602     """
  1438     A class to parse binary mo files.
  1603     A class to parse binary mo files.
  1439     """
  1604     """
  1454         ``check_for_duplicates``
  1619         ``check_for_duplicates``
  1455             whether to check for duplicate entries when adding entries to the
  1620             whether to check for duplicate entries when adding entries to the
  1456             file (optional, default: ``False``).
  1621             file (optional, default: ``False``).
  1457         """
  1622         """
  1458         self.fhandle = open(mofile, 'rb')
  1623         self.fhandle = open(mofile, 'rb')
  1459         self.instance = MOFile(
  1624 
       
  1625         klass = kwargs.get('klass')
       
  1626         if klass is None:
       
  1627             klass = MOFile
       
  1628         self.instance = klass(
  1460             fpath=mofile,
  1629             fpath=mofile,
  1461             encoding=kwargs.get('encoding', default_encoding),
  1630             encoding=kwargs.get('encoding', default_encoding),
  1462             check_for_duplicates=kwargs.get('check_for_duplicates', False)
  1631             check_for_duplicates=kwargs.get('check_for_duplicates', False)
  1463         )
  1632         )
  1464 
  1633 
       
  1634     def __del__(self):
       
  1635         """
       
  1636         Make sure the file is closed, this prevents warnings on unclosed file
       
  1637         when running tests with python >= 3.2.
       
  1638         """
       
  1639         if self.fhandle:
       
  1640             self.fhandle.close()
       
  1641 
  1465     def parse(self):
  1642     def parse(self):
  1466         """
  1643         """
  1467         Build the instance with the file handle provided in the
  1644         Build the instance with the file handle provided in the
  1468         constructor.
  1645         constructor.
  1469         """
  1646         """
  1470         # parse magic number
  1647         # parse magic number
  1471         magic_number = self._readbinary('<I', 4)
  1648         magic_number = self._readbinary('<I', 4)
  1472         if magic_number == MOFile.LITTLE_ENDIAN:
  1649         if magic_number == MOFile.MAGIC:
  1473             ii = '<II'
  1650             ii = '<II'
  1474         elif magic_number == MOFile.BIG_ENDIAN:
  1651         elif magic_number == MOFile.MAGIC_SWAPPED:
  1475             ii = '>II'
  1652             ii = '>II'
  1476         else:
  1653         else:
  1477             raise IOError('Invalid mo file, magic number is incorrect !')
  1654             raise IOError('Invalid mo file, magic number is incorrect !')
  1478         self.instance.magic_number = magic_number
  1655         self.instance.magic_number = magic_number
  1479         # parse the version number and the number of strings
  1656         # parse the version number and the number of strings
  1480         self.instance.version, numofstrings = self._readbinary(ii, 8)
  1657         version, numofstrings = self._readbinary(ii, 8)
       
  1658         # from MO file format specs: "A program seeing an unexpected major
       
  1659         # revision number should stop reading the MO file entirely"
       
  1660         if version not in (0, 1):
       
  1661             raise IOError('Invalid mo file, unexpected major revision number')
       
  1662         self.instance.version = version
  1481         # original strings and translation strings hash table offset
  1663         # original strings and translation strings hash table offset
  1482         msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
  1664         msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
  1483         # move to msgid hash table and read length and offset of msgids
  1665         # move to msgid hash table and read length and offset of msgids
  1484         self.fhandle.seek(msgids_hash_offset)
  1666         self.fhandle.seek(msgids_hash_offset)
  1485         msgids_index = []
  1667         msgids_index = []
  1489         self.fhandle.seek(msgstrs_hash_offset)
  1671         self.fhandle.seek(msgstrs_hash_offset)
  1490         msgstrs_index = []
  1672         msgstrs_index = []
  1491         for i in range(numofstrings):
  1673         for i in range(numofstrings):
  1492             msgstrs_index.append(self._readbinary(ii, 8))
  1674             msgstrs_index.append(self._readbinary(ii, 8))
  1493         # build entries
  1675         # build entries
       
  1676         encoding = self.instance.encoding
  1494         for i in range(numofstrings):
  1677         for i in range(numofstrings):
  1495             self.fhandle.seek(msgids_index[i][1])
  1678             self.fhandle.seek(msgids_index[i][1])
  1496             msgid = self.fhandle.read(msgids_index[i][0])
  1679             msgid = self.fhandle.read(msgids_index[i][0])
       
  1680 
  1497             self.fhandle.seek(msgstrs_index[i][1])
  1681             self.fhandle.seek(msgstrs_index[i][1])
  1498             msgstr = self.fhandle.read(msgstrs_index[i][0])
  1682             msgstr = self.fhandle.read(msgstrs_index[i][0])
  1499             if i == 0: # metadata
  1683             if i == 0 and not msgid:  # metadata
  1500                 raw_metadata, metadata = msgstr.split('\n'), {}
  1684                 raw_metadata, metadata = msgstr.split(b('\n')), {}
  1501                 for line in raw_metadata:
  1685                 for line in raw_metadata:
  1502                     tokens = line.split(':', 1)
  1686                     tokens = line.split(b(':'), 1)
  1503                     if tokens[0] != '':
  1687                     if tokens[0] != b(''):
  1504                         try:
  1688                         try:
  1505                             metadata[tokens[0]] = tokens[1].strip()
  1689                             k = tokens[0].decode(encoding)
       
  1690                             v = tokens[1].decode(encoding)
       
  1691                             metadata[k] = v.strip()
  1506                         except IndexError:
  1692                         except IndexError:
  1507                             metadata[tokens[0]] = ''
  1693                             metadata[k] = u('')
  1508                 self.instance.metadata = metadata
  1694                 self.instance.metadata = metadata
  1509                 continue
  1695                 continue
  1510             # test if we have a plural entry
  1696             # test if we have a plural entry
  1511             msgid_tokens = msgid.split('\0')
  1697             msgid_tokens = msgid.split(b('\0'))
  1512             if len(msgid_tokens) > 1:
  1698             if len(msgid_tokens) > 1:
  1513                 entry = self._build_entry(
  1699                 entry = self._build_entry(
  1514                     msgid=msgid_tokens[0],
  1700                     msgid=msgid_tokens[0],
  1515                     msgid_plural=msgid_tokens[1],
  1701                     msgid_plural=msgid_tokens[1],
  1516                     msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0')))
  1702                     msgstr_plural=dict((k, v) for k, v in
       
  1703                                        enumerate(msgstr.split(b('\0'))))
  1517                 )
  1704                 )
  1518             else:
  1705             else:
  1519                 entry = self._build_entry(msgid=msgid, msgstr=msgstr)
  1706                 entry = self._build_entry(msgid=msgid, msgstr=msgstr)
  1520             self.instance.append(entry)
  1707             self.instance.append(entry)
  1521         # close opened file
  1708         # close opened file
  1522         self.fhandle.close()
  1709         self.fhandle.close()
  1523         return self.instance
  1710         return self.instance
  1524 
  1711 
  1525     def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
  1712     def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
  1526                      msgstr_plural=None):
  1713                      msgstr_plural=None):
  1527         msgctxt_msgid = msgid.split('\x04')
  1714         msgctxt_msgid = msgid.split(b('\x04'))
       
  1715         encoding = self.instance.encoding
  1528         if len(msgctxt_msgid) > 1:
  1716         if len(msgctxt_msgid) > 1:
  1529             kwargs = {
  1717             kwargs = {
  1530                 'msgctxt': msgctxt_msgid[0],
  1718                 'msgctxt': msgctxt_msgid[0].decode(encoding),
  1531                 'msgid'  : msgctxt_msgid[1],
  1719                 'msgid': msgctxt_msgid[1].decode(encoding),
  1532             }
  1720             }
  1533         else:
  1721         else:
  1534             kwargs = {'msgid': msgid}
  1722             kwargs = {'msgid': msgid.decode(encoding)}
  1535         if msgstr:
  1723         if msgstr:
  1536             kwargs['msgstr'] = msgstr
  1724             kwargs['msgstr'] = msgstr.decode(encoding)
  1537         if msgid_plural:
  1725         if msgid_plural:
  1538             kwargs['msgid_plural'] = msgid_plural
  1726             kwargs['msgid_plural'] = msgid_plural.decode(encoding)
  1539         if msgstr_plural:
  1727         if msgstr_plural:
       
  1728             for k in msgstr_plural:
       
  1729                 msgstr_plural[k] = msgstr_plural[k].decode(encoding)
  1540             kwargs['msgstr_plural'] = msgstr_plural
  1730             kwargs['msgstr_plural'] = msgstr_plural
  1541         return MOEntry(**kwargs)
  1731         return MOEntry(**kwargs)
  1542 
  1732 
  1543     def _readbinary(self, fmt, numbytes):
  1733     def _readbinary(self, fmt, numbytes):
  1544         """
  1734         """
  1548         bytes = self.fhandle.read(numbytes)
  1738         bytes = self.fhandle.read(numbytes)
  1549         tup = struct.unpack(fmt, bytes)
  1739         tup = struct.unpack(fmt, bytes)
  1550         if len(tup) == 1:
  1740         if len(tup) == 1:
  1551             return tup[0]
  1741             return tup[0]
  1552         return tup
  1742         return tup
  1553 
       
  1554 # }}}
  1743 # }}}
       
  1744 # class TextWrapper {{{
       
  1745 
       
  1746 
       
  1747 class TextWrapper(textwrap.TextWrapper):
       
  1748     """
       
  1749     Subclass of textwrap.TextWrapper that backport the
       
  1750     drop_whitespace option.
       
  1751     """
       
  1752     def __init__(self, *args, **kwargs):
       
  1753         drop_whitespace = kwargs.pop('drop_whitespace', True)
       
  1754         textwrap.TextWrapper.__init__(self, *args, **kwargs)
       
  1755         self.drop_whitespace = drop_whitespace
       
  1756 
       
  1757     def _wrap_chunks(self, chunks):
       
  1758         """_wrap_chunks(chunks : [string]) -> [string]
       
  1759 
       
  1760         Wrap a sequence of text chunks and return a list of lines of
       
  1761         length 'self.width' or less.  (If 'break_long_words' is false,
       
  1762         some lines may be longer than this.)  Chunks correspond roughly
       
  1763         to words and the whitespace between them: each chunk is
       
  1764         indivisible (modulo 'break_long_words'), but a line break can
       
  1765         come between any two chunks.  Chunks should not have internal
       
  1766         whitespace; ie. a chunk is either all whitespace or a "word".
       
  1767         Whitespace chunks will be removed from the beginning and end of
       
  1768         lines, but apart from that whitespace is preserved.
       
  1769         """
       
  1770         lines = []
       
  1771         if self.width <= 0:
       
  1772             raise ValueError("invalid width %r (must be > 0)" % self.width)
       
  1773 
       
  1774         # Arrange in reverse order so items can be efficiently popped
       
  1775         # from a stack of chucks.
       
  1776         chunks.reverse()
       
  1777 
       
  1778         while chunks:
       
  1779 
       
  1780             # Start the list of chunks that will make up the current line.
       
  1781             # cur_len is just the length of all the chunks in cur_line.
       
  1782             cur_line = []
       
  1783             cur_len = 0
       
  1784 
       
  1785             # Figure out which static string will prefix this line.
       
  1786             if lines:
       
  1787                 indent = self.subsequent_indent
       
  1788             else:
       
  1789                 indent = self.initial_indent
       
  1790 
       
  1791             # Maximum width for this line.
       
  1792             width = self.width - len(indent)
       
  1793 
       
  1794             # First chunk on line is whitespace -- drop it, unless this
       
  1795             # is the very beginning of the text (ie. no lines started yet).
       
  1796             if self.drop_whitespace and chunks[-1].strip() == '' and lines:
       
  1797                 del chunks[-1]
       
  1798 
       
  1799             while chunks:
       
  1800                 l = len(chunks[-1])
       
  1801 
       
  1802                 # Can at least squeeze this chunk onto the current line.
       
  1803                 if cur_len + l <= width:
       
  1804                     cur_line.append(chunks.pop())
       
  1805                     cur_len += l
       
  1806 
       
  1807                 # Nope, this line is full.
       
  1808                 else:
       
  1809                     break
       
  1810 
       
  1811             # The current line is full, and the next chunk is too big to
       
  1812             # fit on *any* line (not just this one).
       
  1813             if chunks and len(chunks[-1]) > width:
       
  1814                 self._handle_long_word(chunks, cur_line, cur_len, width)
       
  1815 
       
  1816             # If the last chunk on this line is all whitespace, drop it.
       
  1817             if self.drop_whitespace and cur_line and not cur_line[-1].strip():
       
  1818                 del cur_line[-1]
       
  1819 
       
  1820             # Convert current line back to a string and store it in list
       
  1821             # of all lines (return value).
       
  1822             if cur_line:
       
  1823                 lines.append(indent + ''.join(cur_line))
       
  1824 
       
  1825         return lines
       
  1826 # }}}
       
  1827 # function wrap() {{{
       
  1828 
       
  1829 
       
  1830 def wrap(text, width=70, **kwargs):
       
  1831     """
       
  1832     Wrap a single paragraph of text, returning a list of wrapped lines.
       
  1833     """
       
  1834     if sys.version_info < (2, 6):
       
  1835         return TextWrapper(width=width, **kwargs).wrap(text)
       
  1836     return textwrap.wrap(text, width=width, **kwargs)
       
  1837 
       
  1838 # }}}