13 :func:`~polib.mofile` convenience functions. |
13 :func:`~polib.mofile` convenience functions. |
14 """ |
14 """ |
15 |
15 |
16 from __future__ import absolute_import |
16 from __future__ import absolute_import |
17 |
17 |
18 __author__ = 'David Jean Louis <izimobil@gmail.com>' |
18 __author__ = 'David Jean Louis <izimobil@gmail.com>' |
19 __version__ = '0.6.4' |
19 __version__ = '1.0.7' |
20 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', |
20 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', |
21 'detect_encoding', 'escape', 'unescape', 'detect_encoding',] |
21 'default_encoding', 'escape', 'unescape', 'detect_encoding', ] |
22 |
22 |
23 import array |
23 import array |
24 import codecs |
24 import codecs |
25 import os |
25 import os |
26 import re |
26 import re |
27 import struct |
27 import struct |
28 import sys |
28 import sys |
29 import textwrap |
29 import textwrap |
30 import types |
30 |
|
31 try: |
|
32 import io |
|
33 except ImportError: |
|
34 # replacement of io.open() for python < 2.6 |
|
35 # we use codecs instead |
|
36 class io(object): |
|
37 @staticmethod |
|
38 def open(fpath, mode='r', encoding=None): |
|
39 return codecs.open(fpath, mode, encoding) |
31 |
40 |
32 |
41 |
33 # the default encoding to use when encoding cannot be detected |
42 # the default encoding to use when encoding cannot be detected |
34 default_encoding = 'utf-8' |
43 default_encoding = 'utf-8' |
35 |
44 |
|
45 # python 2/3 compatibility helpers {{{ |
|
46 |
|
47 |
|
48 if sys.version_info[:2] < (3, 0): |
|
49 PY3 = False |
|
50 text_type = unicode |
|
51 |
|
52 def b(s): |
|
53 return s |
|
54 |
|
55 def u(s): |
|
56 return unicode(s, "unicode_escape") |
|
57 |
|
58 else: |
|
59 PY3 = True |
|
60 text_type = str |
|
61 |
|
62 def b(s): |
|
63 return s.encode("latin-1") |
|
64 |
|
65 def u(s): |
|
66 return s |
|
67 # }}} |
36 # _pofile_or_mofile {{{ |
68 # _pofile_or_mofile {{{ |
|
69 |
37 |
70 |
38 def _pofile_or_mofile(f, type, **kwargs): |
71 def _pofile_or_mofile(f, type, **kwargs): |
39 """ |
72 """ |
40 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to |
73 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to |
41 honor the DRY concept. |
74 honor the DRY concept. |
127 string, full or relative path to the po/mo file or its content. |
189 string, full or relative path to the po/mo file or its content. |
128 |
190 |
129 ``binary_mode`` |
191 ``binary_mode`` |
130 boolean, set this to True if ``file`` is a mo file. |
192 boolean, set this to True if ``file`` is a mo file. |
131 """ |
193 """ |
132 rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)') |
194 PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)' |
|
195 rxt = re.compile(u(PATTERN)) |
|
196 rxb = re.compile(b(PATTERN)) |
133 |
197 |
134 def charset_exists(charset): |
198 def charset_exists(charset): |
135 """Check whether ``charset`` is valid or not.""" |
199 """Check whether ``charset`` is valid or not.""" |
136 try: |
200 try: |
137 codecs.lookup(charset) |
201 codecs.lookup(charset) |
138 except LookupError: |
202 except LookupError: |
139 return False |
203 return False |
140 return True |
204 return True |
141 |
205 |
142 if not os.path.exists(file): |
206 if not _is_file(file): |
143 match = rx.search(file) |
207 match = rxt.search(file) |
144 if match: |
208 if match: |
145 enc = match.group(1).strip() |
209 enc = match.group(1).strip() |
146 if charset_exists(enc): |
210 if charset_exists(enc): |
147 return enc |
211 return enc |
148 else: |
212 else: |
149 if binary_mode: |
213 # For PY3, always treat as binary |
|
214 if binary_mode or PY3: |
150 mode = 'rb' |
215 mode = 'rb' |
|
216 rx = rxb |
151 else: |
217 else: |
152 mode = 'r' |
218 mode = 'r' |
|
219 rx = rxt |
153 f = open(file, mode) |
220 f = open(file, mode) |
154 for l in f.readlines(): |
221 for l in f.readlines(): |
155 match = rx.search(l) |
222 match = rx.search(l) |
156 if match: |
223 if match: |
157 f.close() |
224 f.close() |
158 enc = match.group(1).strip() |
225 enc = match.group(1).strip() |
|
226 if not isinstance(enc, text_type): |
|
227 enc = enc.decode('utf-8') |
159 if charset_exists(enc): |
228 if charset_exists(enc): |
160 return enc |
229 return enc |
161 f.close() |
230 f.close() |
162 return default_encoding |
231 return default_encoding |
163 |
|
164 # }}} |
232 # }}} |
165 # function escape() {{{ |
233 # function escape() {{{ |
|
234 |
166 |
235 |
167 def escape(st): |
236 def escape(st): |
168 """ |
237 """ |
169 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in |
238 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in |
170 the given string ``st`` and returns it. |
239 the given string ``st`` and returns it. |
252 [e for e in self if not e.obsolete] |
321 [e for e in self if not e.obsolete] |
253 for entry in entries: |
322 for entry in entries: |
254 ret.append(entry.__unicode__(self.wrapwidth)) |
323 ret.append(entry.__unicode__(self.wrapwidth)) |
255 for entry in self.obsolete_entries(): |
324 for entry in self.obsolete_entries(): |
256 ret.append(entry.__unicode__(self.wrapwidth)) |
325 ret.append(entry.__unicode__(self.wrapwidth)) |
257 ret = '\n'.join(ret) |
326 ret = u('\n').join(ret) |
258 |
327 |
259 if type(ret) != types.UnicodeType: |
328 assert isinstance(ret, text_type) |
260 return unicode(ret, self.encoding) |
329 #if type(ret) != text_type: |
|
330 # return unicode(ret, self.encoding) |
261 return ret |
331 return ret |
262 |
332 |
263 def __str__(self): |
333 if PY3: |
264 """ |
334 def __str__(self): |
265 Returns the string representation of the file. |
335 return self.__unicode__() |
266 """ |
336 else: |
267 return unicode(self).encode(self.encoding) |
337 def __str__(self): |
|
338 """ |
|
339 Returns the string representation of the file. |
|
340 """ |
|
341 return unicode(self).encode(self.encoding) |
268 |
342 |
269 def __contains__(self, entry): |
343 def __contains__(self, entry): |
270 """ |
344 """ |
271 Overriden ``list`` method to implement the membership test (in and |
345 Overridden ``list`` method to implement the membership test (in and |
272 not in). |
346 not in). |
273 The method considers that an entry is in the file if it finds an entry |
347 The method considers that an entry is in the file if it finds an entry |
274 that has the same msgid (the test is **case sensitive**). |
348 that has the same msgid (the test is **case sensitive**) and the same |
|
349 msgctxt (or none for both entries). |
275 |
350 |
276 Argument: |
351 Argument: |
277 |
352 |
278 ``entry`` |
353 ``entry`` |
279 an instance of :class:`~polib._BaseEntry`. |
354 an instance of :class:`~polib._BaseEntry`. |
280 """ |
355 """ |
281 return self.find(entry.msgid, by='msgid') is not None |
356 return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \ |
|
357 is not None |
282 |
358 |
283 def __eq__(self, other): |
359 def __eq__(self, other): |
284 return unicode(self) == unicode(other) |
360 return str(self) == str(other) |
285 |
361 |
286 def append(self, entry): |
362 def append(self, entry): |
287 """ |
363 """ |
288 Overriden method to check for duplicates entries, if a user tries to |
364 Overridden method to check for duplicates entries, if a user tries to |
289 add an entry that is already in the file, the method will raise a |
365 add an entry that is already in the file, the method will raise a |
290 ``ValueError`` exception. |
366 ``ValueError`` exception. |
291 |
367 |
292 Argument: |
368 Argument: |
293 |
369 |
410 'PO-Revision-Date', |
486 'PO-Revision-Date', |
411 'Last-Translator', |
487 'Last-Translator', |
412 'Language-Team', |
488 'Language-Team', |
413 'MIME-Version', |
489 'MIME-Version', |
414 'Content-Type', |
490 'Content-Type', |
415 'Content-Transfer-Encoding' |
491 'Content-Transfer-Encoding', |
|
492 'Language', |
|
493 'Plural-Forms' |
416 ] |
494 ] |
417 ordered_data = [] |
495 ordered_data = [] |
418 for data in data_order: |
496 for data in data_order: |
419 try: |
497 try: |
420 value = metadata.pop(data) |
498 value = metadata.pop(data) |
421 ordered_data.append((data, value)) |
499 ordered_data.append((data, value)) |
422 except KeyError: |
500 except KeyError: |
423 pass |
501 pass |
424 # the rest of the metadata will be alphabetically ordered since there |
502 # the rest of the metadata will be alphabetically ordered since there |
425 # are no specs for this AFAIK |
503 # are no specs for this AFAIK |
426 keys = metadata.keys() |
504 for data in sorted(metadata.keys()): |
427 keys.sort() |
|
428 for data in keys: |
|
429 value = metadata[data] |
505 value = metadata[data] |
430 ordered_data.append((data, value)) |
506 ordered_data.append((data, value)) |
431 return ordered_data |
507 return ordered_data |
432 |
508 |
433 def to_binary(self): |
509 def to_binary(self): |
434 """ |
510 """ |
435 Return the binary representation of the file. |
511 Return the binary representation of the file. |
436 """ |
512 """ |
437 offsets = [] |
513 offsets = [] |
438 entries = self.translated_entries() |
514 entries = self.translated_entries() |
|
515 |
439 # the keys are sorted in the .mo file |
516 # the keys are sorted in the .mo file |
440 def cmp(_self, other): |
517 def cmp(_self, other): |
441 # msgfmt compares entries with msgctxt if it exists |
518 # msgfmt compares entries with msgctxt if it exists |
442 if _self.msgctxt: |
519 self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid |
443 self_msgid = _self.msgctxt |
520 other_msgid = other.msgctxt and other.msgctxt or other.msgid |
444 else: |
|
445 self_msgid = _self.msgid |
|
446 |
|
447 if other.msgctxt: |
|
448 other_msgid = other.msgctxt |
|
449 else: |
|
450 other_msgid = other.msgid |
|
451 if self_msgid > other_msgid: |
521 if self_msgid > other_msgid: |
452 return 1 |
522 return 1 |
453 elif self_msgid < other_msgid: |
523 elif self_msgid < other_msgid: |
454 return -1 |
524 return -1 |
455 else: |
525 else: |
456 return 0 |
526 return 0 |
457 # add metadata entry |
527 # add metadata entry |
458 entries.sort(cmp) |
528 entries.sort(key=lambda o: o.msgctxt or o.msgid) |
459 mentry = self.metadata_as_entry() |
529 mentry = self.metadata_as_entry() |
460 #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() |
530 #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() |
461 entries = [mentry] + entries |
531 entries = [mentry] + entries |
462 entries_len = len(entries) |
532 entries_len = len(entries) |
463 ids, strs = '', '' |
533 ids, strs = b(''), b('') |
464 for e in entries: |
534 for e in entries: |
465 # For each string, we need size and file offset. Each string is |
535 # For each string, we need size and file offset. Each string is |
466 # NUL terminated; the NUL does not count into the size. |
536 # NUL terminated; the NUL does not count into the size. |
467 msgid = '' |
537 msgid = b('') |
468 if e.msgctxt: |
538 if e.msgctxt: |
469 # Contexts are stored by storing the concatenation of the |
539 # Contexts are stored by storing the concatenation of the |
470 # context, a <EOT> byte, and the original string |
540 # context, a <EOT> byte, and the original string |
471 msgid = self._encode(e.msgctxt + '\4') |
541 msgid = self._encode(e.msgctxt + '\4') |
472 if e.msgid_plural: |
542 if e.msgid_plural: |
473 indexes = e.msgstr_plural.keys() |
|
474 indexes.sort() |
|
475 msgstr = [] |
543 msgstr = [] |
476 for index in indexes: |
544 for index in sorted(e.msgstr_plural.keys()): |
477 msgstr.append(e.msgstr_plural[index]) |
545 msgstr.append(e.msgstr_plural[index]) |
478 msgid += self._encode(e.msgid + '\0' + e.msgid_plural) |
546 msgid += self._encode(e.msgid + '\0' + e.msgid_plural) |
479 msgstr = self._encode('\0'.join(msgstr)) |
547 msgstr = self._encode('\0'.join(msgstr)) |
480 else: |
548 else: |
481 msgid += self._encode(e.msgid) |
549 msgid += self._encode(e.msgid) |
482 msgstr = self._encode(e.msgstr) |
550 msgstr = self._encode(e.msgstr) |
483 offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) |
551 offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) |
484 ids += msgid + '\0' |
552 ids += msgid + b('\0') |
485 strs += msgstr + '\0' |
553 strs += msgstr + b('\0') |
486 |
554 |
487 # The header is 7 32-bit unsigned integers. |
555 # The header is 7 32-bit unsigned integers. |
488 keystart = 7*4+16*entries_len |
556 keystart = 7 * 4 + 16 * entries_len |
489 # and the values start after the keys |
557 # and the values start after the keys |
490 valuestart = keystart + len(ids) |
558 valuestart = keystart + len(ids) |
491 koffsets = [] |
559 koffsets = [] |
492 voffsets = [] |
560 voffsets = [] |
493 # The string table first has the list of keys, then the list of values. |
561 # The string table first has the list of keys, then the list of values. |
494 # Each entry has first the size of the string, then the file offset. |
562 # Each entry has first the size of the string, then the file offset. |
495 for o1, l1, o2, l2 in offsets: |
563 for o1, l1, o2, l2 in offsets: |
496 koffsets += [l1, o1+keystart] |
564 koffsets += [l1, o1 + keystart] |
497 voffsets += [l2, o2+valuestart] |
565 voffsets += [l2, o2 + valuestart] |
498 offsets = koffsets + voffsets |
566 offsets = koffsets + voffsets |
499 # check endianness for magic number |
|
500 if struct.pack('@h', 1) == struct.pack('<h', 1): |
|
501 magic_number = MOFile.LITTLE_ENDIAN |
|
502 else: |
|
503 magic_number = MOFile.BIG_ENDIAN |
|
504 |
567 |
505 output = struct.pack( |
568 output = struct.pack( |
506 "Iiiiiii", |
569 "Iiiiiii", |
507 magic_number, # Magic number |
570 # Magic number |
508 0, # Version |
571 MOFile.MAGIC, |
509 entries_len, # # of entries |
572 # Version |
510 7*4, # start of key index |
573 0, |
511 7*4+entries_len*8, # start of value index |
574 # number of entries |
512 0, keystart # size and offset of hash table |
575 entries_len, |
513 # Important: we don't use hash tables |
576 # start of key index |
|
577 7 * 4, |
|
578 # start of value index |
|
579 7 * 4 + entries_len * 8, |
|
580 # size and offset of hash table, we don't use hash tables |
|
581 0, keystart |
|
582 |
514 ) |
583 ) |
515 output += array.array("i", offsets).tostring() |
584 if PY3 and sys.version_info.minor > 1: # python 3.2 or superior |
|
585 output += array.array("i", offsets).tobytes() |
|
586 else: |
|
587 output += array.array("i", offsets).tostring() |
516 output += ids |
588 output += ids |
517 output += strs |
589 output += strs |
518 return output |
590 return output |
519 |
591 |
520 def _encode(self, mixed): |
592 def _encode(self, mixed): |
521 """ |
593 """ |
522 Encodes the given ``mixed`` argument with the file encoding if and |
594 Encodes the given ``mixed`` argument with the file encoding if and |
523 only if it's an unicode string and returns the encoded string. |
595 only if it's an unicode string and returns the encoded string. |
524 """ |
596 """ |
525 if type(mixed) == types.UnicodeType: |
597 if isinstance(mixed, text_type): |
526 return mixed.encode(self.encoding) |
598 mixed = mixed.encode(self.encoding) |
527 return mixed |
599 return mixed |
528 |
|
529 # }}} |
600 # }}} |
530 # class POFile {{{ |
601 # class POFile {{{ |
|
602 |
531 |
603 |
532 class POFile(_BaseFile): |
604 class POFile(_BaseFile): |
533 """ |
605 """ |
534 Po (or Pot) file reader/writer. |
606 Po (or Pot) file reader/writer. |
535 This class inherits the :class:`~polib._BaseFile` class and, by extension, |
607 This class inherits the :class:`~polib._BaseFile` class and, by extension, |
751 else: |
829 else: |
752 delflag = '' |
830 delflag = '' |
753 ret = [] |
831 ret = [] |
754 # write the msgctxt if any |
832 # write the msgctxt if any |
755 if self.msgctxt is not None: |
833 if self.msgctxt is not None: |
756 ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth) |
834 ret += self._str_field("msgctxt", delflag, "", self.msgctxt, |
|
835 wrapwidth) |
757 # write the msgid |
836 # write the msgid |
758 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) |
837 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) |
759 # write the msgid_plural if any |
838 # write the msgid_plural if any |
760 if self.msgid_plural: |
839 if self.msgid_plural: |
761 ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth) |
840 ret += self._str_field("msgid_plural", delflag, "", |
|
841 self.msgid_plural, wrapwidth) |
762 if self.msgstr_plural: |
842 if self.msgstr_plural: |
763 # write the msgstr_plural if any |
843 # write the msgstr_plural if any |
764 msgstrs = self.msgstr_plural |
844 msgstrs = self.msgstr_plural |
765 keys = list(msgstrs) |
845 keys = list(msgstrs) |
766 keys.sort() |
846 keys.sort() |
767 for index in keys: |
847 for index in keys: |
768 msgstr = msgstrs[index] |
848 msgstr = msgstrs[index] |
769 plural_index = '[%s]' % index |
849 plural_index = '[%s]' % index |
770 ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth) |
850 ret += self._str_field("msgstr", delflag, plural_index, msgstr, |
|
851 wrapwidth) |
771 else: |
852 else: |
772 # otherwise write the msgstr |
853 # otherwise write the msgstr |
773 ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth) |
854 ret += self._str_field("msgstr", delflag, "", self.msgstr, |
|
855 wrapwidth) |
774 ret.append('') |
856 ret.append('') |
775 ret = '\n'.join(ret) |
857 ret = u('\n').join(ret) |
776 |
|
777 if type(ret) != types.UnicodeType: |
|
778 return unicode(ret, self.encoding) |
|
779 return ret |
858 return ret |
780 |
859 |
781 def __str__(self): |
860 if PY3: |
782 """ |
861 def __str__(self): |
783 Returns the string representation of the entry. |
862 return self.__unicode__() |
784 """ |
863 else: |
785 return unicode(self).encode(self.encoding) |
864 def __str__(self): |
|
865 """ |
|
866 Returns the string representation of the entry. |
|
867 """ |
|
868 return unicode(self).encode(self.encoding) |
786 |
869 |
787 def __eq__(self, other): |
870 def __eq__(self, other): |
788 return unicode(self) == unicode(other) |
871 return str(self) == str(other) |
789 |
872 |
790 def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78): |
873 def _str_field(self, fieldname, delflag, plural_index, field, |
|
874 wrapwidth=78): |
791 lines = field.splitlines(True) |
875 lines = field.splitlines(True) |
792 if len(lines) > 1: |
876 if len(lines) > 1: |
793 lines = [''] + lines # start with initial empty line |
877 lines = [''] + lines # start with initial empty line |
794 else: |
878 else: |
795 escaped_field = escape(field) |
879 escaped_field = escape(field) |
796 specialchars_count = 0 |
880 specialchars_count = 0 |
797 for c in ['\\', '\n', '\r', '\t', '"']: |
881 for c in ['\\', '\n', '\r', '\t', '"']: |
798 specialchars_count += field.count(c) |
882 specialchars_count += field.count(c) |
852 ``previous_msgid`` |
936 ``previous_msgid`` |
853 string, the entry previous msgid. |
937 string, the entry previous msgid. |
854 |
938 |
855 ``previous_msgid_plural`` |
939 ``previous_msgid_plural`` |
856 string, the entry previous msgid_plural. |
940 string, the entry previous msgid_plural. |
|
941 |
|
942 ``linenum`` |
|
943 integer, the line number of the entry |
857 """ |
944 """ |
858 _BaseEntry.__init__(self, *args, **kwargs) |
945 _BaseEntry.__init__(self, *args, **kwargs) |
859 self.comment = kwargs.get('comment', '') |
946 self.comment = kwargs.get('comment', '') |
860 self.tcomment = kwargs.get('tcomment', '') |
947 self.tcomment = kwargs.get('tcomment', '') |
861 self.occurrences = kwargs.get('occurrences', []) |
948 self.occurrences = kwargs.get('occurrences', []) |
862 self.flags = kwargs.get('flags', []) |
949 self.flags = kwargs.get('flags', []) |
863 self.previous_msgctxt = kwargs.get('previous_msgctxt', None) |
950 self.previous_msgctxt = kwargs.get('previous_msgctxt', None) |
864 self.previous_msgid = kwargs.get('previous_msgid', None) |
951 self.previous_msgid = kwargs.get('previous_msgid', None) |
865 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) |
952 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) |
|
953 self.linenum = kwargs.get('linenum', None) |
866 |
954 |
867 def __unicode__(self, wrapwidth=78): |
955 def __unicode__(self, wrapwidth=78): |
868 """ |
956 """ |
869 Returns the unicode representation of the entry. |
957 Returns the unicode representation of the entry. |
870 """ |
958 """ |
916 # flags (TODO: wrapping ?) |
1004 # flags (TODO: wrapping ?) |
917 if self.flags: |
1005 if self.flags: |
918 ret.append('#, %s' % ', '.join(self.flags)) |
1006 ret.append('#, %s' % ', '.join(self.flags)) |
919 |
1007 |
920 # previous context and previous msgid/msgid_plural |
1008 # previous context and previous msgid/msgid_plural |
921 fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'] |
1009 fields = ['previous_msgctxt', 'previous_msgid', |
|
1010 'previous_msgid_plural'] |
922 for f in fields: |
1011 for f in fields: |
923 val = getattr(self, f) |
1012 val = getattr(self, f) |
924 if val: |
1013 if val: |
925 ret += self._str_field(f, "#| ", "", val, wrapwidth) |
1014 ret += self._str_field(f, "#| ", "", val, wrapwidth) |
926 |
1015 |
927 ret.append(_BaseEntry.__unicode__(self, wrapwidth)) |
1016 ret.append(_BaseEntry.__unicode__(self, wrapwidth)) |
928 ret = '\n'.join(ret) |
1017 ret = u('\n').join(ret) |
929 |
1018 |
930 if type(ret) != types.UnicodeType: |
1019 assert isinstance(ret, text_type) |
931 return unicode(ret, self.encoding) |
1020 #if type(ret) != types.UnicodeType: |
|
1021 # return unicode(ret, self.encoding) |
932 return ret |
1022 return ret |
933 |
1023 |
934 def __cmp__(self, other): |
1024 def __cmp__(self, other): |
935 """ |
1025 """ |
936 Called by comparison operations if rich comparison is not defined. |
1026 Called by comparison operations if rich comparison is not defined. |
937 """ |
1027 """ |
938 def compare_occurrences(a, b): |
|
939 """ |
|
940 Compare an entry occurrence with another one. |
|
941 """ |
|
942 if a[0] != b[0]: |
|
943 return a[0] < b[0] |
|
944 if a[1] != b[1]: |
|
945 return a[1] < b[1] |
|
946 return 0 |
|
947 |
1028 |
948 # First: Obsolete test |
1029 # First: Obsolete test |
949 if self.obsolete != other.obsolete: |
1030 if self.obsolete != other.obsolete: |
950 if self.obsolete: |
1031 if self.obsolete: |
951 return -1 |
1032 return -1 |
952 else: |
1033 else: |
953 return 1 |
1034 return 1 |
954 # Work on a copy to protect original |
1035 # Work on a copy to protect original |
955 occ1 = self.occurrences[:] |
1036 occ1 = sorted(self.occurrences[:]) |
956 occ2 = other.occurrences[:] |
1037 occ2 = sorted(other.occurrences[:]) |
957 # Sorting using compare method |
|
958 occ1.sort(compare_occurrences) |
|
959 occ2.sort(compare_occurrences) |
|
960 # Comparing sorted occurrences |
|
961 pos = 0 |
1038 pos = 0 |
962 for entry1 in occ1: |
1039 for entry1 in occ1: |
963 try: |
1040 try: |
964 entry2 = occ2[pos] |
1041 entry2 = occ2[pos] |
965 except IndexError: |
1042 except IndexError: |
1054 ``check_for_duplicates`` |
1194 ``check_for_duplicates`` |
1055 whether to check for duplicate entries when adding entries to the |
1195 whether to check for duplicate entries when adding entries to the |
1056 file (optional, default: ``False``). |
1196 file (optional, default: ``False``). |
1057 """ |
1197 """ |
1058 enc = kwargs.get('encoding', default_encoding) |
1198 enc = kwargs.get('encoding', default_encoding) |
1059 if os.path.exists(pofile): |
1199 if _is_file(pofile): |
1060 try: |
1200 try: |
1061 self.fhandle = codecs.open(pofile, 'rU', enc) |
1201 self.fhandle = io.open(pofile, 'rt', encoding=enc) |
1062 except LookupError: |
1202 except LookupError: |
1063 enc = default_encoding |
1203 enc = default_encoding |
1064 self.fhandle = codecs.open(pofile, 'rU', enc) |
1204 self.fhandle = io.open(pofile, 'rt', encoding=enc) |
1065 else: |
1205 else: |
1066 self.fhandle = pofile.splitlines() |
1206 self.fhandle = pofile.splitlines() |
1067 |
1207 |
1068 self.instance = POFile( |
1208 klass = kwargs.get('klass') |
|
1209 if klass is None: |
|
1210 klass = POFile |
|
1211 self.instance = klass( |
1069 pofile=pofile, |
1212 pofile=pofile, |
1070 encoding=enc, |
1213 encoding=enc, |
1071 check_for_duplicates=kwargs.get('check_for_duplicates', False) |
1214 check_for_duplicates=kwargs.get('check_for_duplicates', False) |
1072 ) |
1215 ) |
1073 self.transitions = {} |
1216 self.transitions = {} |
1074 self.current_entry = POEntry() |
1217 self.current_line = 0 |
1075 self.current_state = 'ST' |
1218 self.current_entry = POEntry(linenum=self.current_line) |
|
1219 self.current_state = 'st' |
1076 self.current_token = None |
1220 self.current_token = None |
1077 # two memo flags used in handlers |
1221 # two memo flags used in handlers |
1078 self.msgstr_index = 0 |
1222 self.msgstr_index = 0 |
1079 self.entry_obsolete = 0 |
1223 self.entry_obsolete = 0 |
1080 # Configure the state machine, by adding transitions. |
1224 # Configure the state machine, by adding transitions. |
1081 # Signification of symbols: |
1225 # Signification of symbols: |
1082 # * ST: Beginning of the file (start) |
1226 # * ST: Beginning of the file (start) |
1083 # * HE: Header |
1227 # * HE: Header |
1084 # * TC: a translation comment |
1228 # * TC: a translation comment |
1085 # * GC: a generated comment |
1229 # * GC: a generated comment |
1086 # * OC: a file/line occurence |
1230 # * OC: a file/line occurrence |
1087 # * FL: a flags line |
1231 # * FL: a flags line |
1088 # * CT: a message context |
1232 # * CT: a message context |
1089 # * PC: a previous msgctxt |
1233 # * PC: a previous msgctxt |
1090 # * PM: a previous msgid |
1234 # * PM: a previous msgid |
1091 # * PP: a previous msgid_plural |
1235 # * PP: a previous msgid_plural |
1092 # * MI: a msgid |
1236 # * MI: a msgid |
1093 # * MP: a msgid plural |
1237 # * MP: a msgid plural |
1094 # * MS: a msgstr |
1238 # * MS: a msgstr |
1095 # * MX: a msgstr plural |
1239 # * MX: a msgstr plural |
1096 # * MC: a msgid or msgstr continuation line |
1240 # * MC: a msgid or msgstr continuation line |
1097 all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC', |
1241 all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc', |
1098 'MS', 'MP', 'MX', 'MI'] |
1242 'ms', 'mp', 'mx', 'mi'] |
1099 |
1243 |
1100 self.add('TC', ['ST', 'HE'], 'HE') |
1244 self.add('tc', ['st', 'he'], 'he') |
1101 self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS', |
1245 self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms', |
1102 'MP', 'MX', 'MI'], 'TC') |
1246 'mp', 'mx', 'mi'], 'tc') |
1103 self.add('GC', all, 'GC') |
1247 self.add('gc', all, 'gc') |
1104 self.add('OC', all, 'OC') |
1248 self.add('oc', all, 'oc') |
1105 self.add('FL', all, 'FL') |
1249 self.add('fl', all, 'fl') |
1106 self.add('PC', all, 'PC') |
1250 self.add('pc', all, 'pc') |
1107 self.add('PM', all, 'PM') |
1251 self.add('pm', all, 'pm') |
1108 self.add('PP', all, 'PP') |
1252 self.add('pp', all, 'pp') |
1109 self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM', |
1253 self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm', |
1110 'PP', 'MS', 'MX'], 'CT') |
1254 'pp', 'ms', 'mx'], 'ct') |
1111 self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC', |
1255 self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc', |
1112 'PM', 'PP', 'MS', 'MX'], 'MI') |
1256 'pm', 'pp', 'ms', 'mx'], 'mi') |
1113 self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP') |
1257 self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp') |
1114 self.add('MS', ['MI', 'MP', 'TC'], 'MS') |
1258 self.add('ms', ['mi', 'mp', 'tc'], 'ms') |
1115 self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX') |
1259 self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx') |
1116 self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC') |
1260 self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc') |
1117 |
1261 |
1118 def parse(self): |
1262 def parse(self): |
1119 """ |
1263 """ |
1120 Run the state machine, parse the file line by line and call process() |
1264 Run the state machine, parse the file line by line and call process() |
1121 with the current matched symbol. |
1265 with the current matched symbol. |
1122 """ |
1266 """ |
1123 i = 0 |
|
1124 |
1267 |
1125 keywords = { |
1268 keywords = { |
1126 'msgctxt': 'CT', |
1269 'msgctxt': 'ct', |
1127 'msgid': 'MI', |
1270 'msgid': 'mi', |
1128 'msgstr': 'MS', |
1271 'msgstr': 'ms', |
1129 'msgid_plural': 'MP', |
1272 'msgid_plural': 'mp', |
1130 } |
1273 } |
1131 prev_keywords = { |
1274 prev_keywords = { |
1132 'msgid_plural': 'PP', |
1275 'msgid_plural': 'pp', |
1133 'msgid': 'PM', |
1276 'msgid': 'pm', |
1134 'msgctxt': 'PC', |
1277 'msgctxt': 'pc', |
1135 } |
1278 } |
1136 |
1279 tokens = [] |
1137 for line in self.fhandle: |
1280 for line in self.fhandle: |
1138 i += 1 |
1281 self.current_line += 1 |
1139 line = line.strip() |
1282 line = line.strip() |
1140 if line == '': |
1283 if line == '': |
1141 continue |
1284 continue |
1142 |
1285 |
1143 tokens = line.split(None, 2) |
1286 tokens = line.split(None, 2) |
1144 nb_tokens = len(tokens) |
1287 nb_tokens = len(tokens) |
|
1288 |
|
1289 if tokens[0] == '#~|': |
|
1290 continue |
1145 |
1291 |
1146 if tokens[0] == '#~' and nb_tokens > 1: |
1292 if tokens[0] == '#~' and nb_tokens > 1: |
1147 line = line[3:].strip() |
1293 line = line[3:].strip() |
1148 tokens = tokens[1:] |
1294 tokens = tokens[1:] |
1149 nb_tokens -= 1 |
1295 nb_tokens -= 1 |
1153 |
1299 |
1154 # Take care of keywords like |
1300 # Take care of keywords like |
1155 # msgid, msgid_plural, msgctxt & msgstr. |
1301 # msgid, msgid_plural, msgctxt & msgstr. |
1156 if tokens[0] in keywords and nb_tokens > 1: |
1302 if tokens[0] in keywords and nb_tokens > 1: |
1157 line = line[len(tokens[0]):].lstrip() |
1303 line = line[len(tokens[0]):].lstrip() |
|
1304 if re.search(r'([^\\]|^)"', line[1:-1]): |
|
1305 raise IOError('Syntax error in po file %s (line %s): ' |
|
1306 'unescaped double quote found' % |
|
1307 (self.instance.fpath, self.current_line)) |
1158 self.current_token = line |
1308 self.current_token = line |
1159 self.process(keywords[tokens[0]], i) |
1309 self.process(keywords[tokens[0]]) |
1160 continue |
1310 continue |
1161 |
1311 |
1162 self.current_token = line |
1312 self.current_token = line |
1163 |
1313 |
1164 if tokens[0] == '#:' and nb_tokens > 1: |
1314 if tokens[0] == '#:': |
|
1315 if nb_tokens <= 1: |
|
1316 continue |
1165 # we are on a occurrences line |
1317 # we are on a occurrences line |
1166 self.process('OC', i) |
1318 self.process('oc') |
1167 |
1319 |
1168 elif line[:1] == '"': |
1320 elif line[:1] == '"': |
1169 # we are on a continuation line |
1321 # we are on a continuation line |
1170 self.process('MC', i) |
1322 if re.search(r'([^\\]|^)"', line[1:-1]): |
|
1323 raise IOError('Syntax error in po file %s (line %s): ' |
|
1324 'unescaped double quote found' % |
|
1325 (self.instance.fpath, self.current_line)) |
|
1326 self.process('mc') |
1171 |
1327 |
1172 elif line[:7] == 'msgstr[': |
1328 elif line[:7] == 'msgstr[': |
1173 # we are on a msgstr plural |
1329 # we are on a msgstr plural |
1174 self.process('MX', i) |
1330 self.process('mx') |
1175 |
1331 |
1176 elif tokens[0] == '#,' and nb_tokens > 1: |
1332 elif tokens[0] == '#,': |
|
1333 if nb_tokens <= 1: |
|
1334 continue |
1177 # we are on a flags line |
1335 # we are on a flags line |
1178 self.process('FL', i) |
1336 self.process('fl') |
1179 |
1337 |
1180 elif tokens[0] == '#': |
1338 elif tokens[0] == '#' or tokens[0].startswith('##'): |
1181 if line == '#': line += ' ' |
1339 if line == '#': |
|
1340 line += ' ' |
1182 # we are on a translator comment line |
1341 # we are on a translator comment line |
1183 self.process('TC', i) |
1342 self.process('tc') |
1184 |
1343 |
1185 elif tokens[0] == '#.' and nb_tokens > 1: |
1344 elif tokens[0] == '#.': |
|
1345 if nb_tokens <= 1: |
|
1346 continue |
1186 # we are on a generated comment line |
1347 # we are on a generated comment line |
1187 self.process('GC', i) |
1348 self.process('gc') |
1188 |
1349 |
1189 elif tokens[0] == '#|': |
1350 elif tokens[0] == '#|': |
1190 if nb_tokens < 2: |
1351 if nb_tokens <= 1: |
1191 self.process('??', i) |
1352 raise IOError('Syntax error in po file %s (line %s)' % |
1192 continue |
1353 (self.instance.fpath, self.current_line)) |
1193 |
1354 |
1194 # Remove the marker and any whitespace right after that. |
1355 # Remove the marker and any whitespace right after that. |
1195 line = line[2:].lstrip() |
1356 line = line[2:].lstrip() |
1196 self.current_token = line |
1357 self.current_token = line |
1197 |
1358 |
1198 if tokens[1].startswith('"'): |
1359 if tokens[1].startswith('"'): |
1199 # Continuation of previous metadata. |
1360 # Continuation of previous metadata. |
1200 self.process('MC', i) |
1361 self.process('mc') |
1201 continue |
1362 continue |
1202 |
1363 |
1203 if nb_tokens == 2: |
1364 if nb_tokens == 2: |
1204 # Invalid continuation line. |
1365 # Invalid continuation line. |
1205 self.process('??', i) |
1366 raise IOError('Syntax error in po file %s (line %s): ' |
|
1367 'invalid continuation line' % |
|
1368 (self.instance.fpath, self.current_line)) |
1206 |
1369 |
1207 # we are on a "previous translation" comment line, |
1370 # we are on a "previous translation" comment line, |
1208 if tokens[1] not in prev_keywords: |
1371 if tokens[1] not in prev_keywords: |
1209 # Unknown keyword in previous translation comment. |
1372 # Unknown keyword in previous translation comment. |
1210 self.process('??', i) |
1373 raise IOError('Syntax error in po file %s (line %s): ' |
|
1374 'unknown keyword %s' % |
|
1375 (self.instance.fpath, self.current_line, |
|
1376 tokens[1])) |
1211 |
1377 |
1212 # Remove the keyword and any whitespace |
1378 # Remove the keyword and any whitespace |
1213 # between it and the starting quote. |
1379 # between it and the starting quote. |
1214 line = line[len(tokens[1]):].lstrip() |
1380 line = line[len(tokens[1]):].lstrip() |
1215 self.current_token = line |
1381 self.current_token = line |
1216 self.process(prev_keywords[tokens[1]], i) |
1382 self.process(prev_keywords[tokens[1]]) |
1217 |
1383 |
1218 else: |
1384 else: |
1219 self.process('??', i) |
1385 raise IOError('Syntax error in po file %s (line %s)' % |
1220 |
1386 (self.instance.fpath, self.current_line)) |
1221 if self.current_entry: |
1387 |
|
1388 if self.current_entry and len(tokens) > 0 and \ |
|
1389 not tokens[0].startswith('#'): |
1222 # since entries are added when another entry is found, we must add |
1390 # since entries are added when another entry is found, we must add |
1223 # the last entry here (only if there are lines) |
1391 # the last entry here (only if there are lines). Trailing comments |
|
1392 # are ignored |
1224 self.instance.append(self.current_entry) |
1393 self.instance.append(self.current_entry) |
|
1394 |
1225 # before returning the instance, check if there's metadata and if |
1395 # before returning the instance, check if there's metadata and if |
1226 # so extract it in a dict |
1396 # so extract it in a dict |
1227 firstentry = self.instance[0] |
1397 metadataentry = self.instance.find('') |
1228 if firstentry.msgid == '': # metadata found |
1398 if metadataentry: # metadata found |
1229 # remove the entry |
1399 # remove the entry |
1230 firstentry = self.instance.pop(0) |
1400 self.instance.remove(metadataentry) |
1231 self.instance.metadata_is_fuzzy = firstentry.flags |
1401 self.instance.metadata_is_fuzzy = metadataentry.flags |
1232 key = None |
1402 key = None |
1233 for msg in firstentry.msgstr.splitlines(): |
1403 for msg in metadataentry.msgstr.splitlines(): |
1234 try: |
1404 try: |
1235 key, val = msg.split(':', 1) |
1405 key, val = msg.split(':', 1) |
1236 self.instance.metadata[key] = val.strip() |
1406 self.instance.metadata[key] = val.strip() |
1237 except: |
1407 except (ValueError, KeyError): |
1238 if key is not None: |
1408 if key is not None: |
1239 self.instance.metadata[key] += '\n'+ msg.strip() |
1409 self.instance.metadata[key] += '\n' + msg.strip() |
1240 # close opened file |
1410 # close opened file |
1241 if isinstance(self.fhandle, file): |
1411 if not isinstance(self.fhandle, list): # must be file |
1242 self.fhandle.close() |
1412 self.fhandle.close() |
1243 return self.instance |
1413 return self.instance |
1244 |
1414 |
1245 def add(self, symbol, states, next_state): |
1415 def add(self, symbol, states, next_state): |
1246 """ |
1416 """ |
1290 self.instance.header += self.current_token[2:] |
1461 self.instance.header += self.current_token[2:] |
1291 return 1 |
1462 return 1 |
1292 |
1463 |
def handle_tc(self):
    """
    Handle a translator comment line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The comment text (the current token without its leading ``#``
    characters and without one optional following space) is then added to
    the current entry's ``tcomment``, separated from any previous comment
    text by a newline.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.tcomment != '':
        entry.tcomment += '\n'
    # lines such as "## note" are accepted too: strip every leading '#'
    tcomment = self.current_token.lstrip('#')
    if tcomment.startswith(' '):
        # only the single separating space is removed, so that any extra
        # indentation of the comment is preserved
        tcomment = tcomment[1:]
    entry.tcomment += tcomment
    return True
1302 |
1476 |
def handle_gc(self):
    """
    Handle a generated comment line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, minus its first three characters (the marker
    prefix), is then added to the current entry's ``comment``, separated
    from any previous comment text by a newline.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    if entry.comment != '':
        entry.comment += '\n'
    entry.comment += self.current_token[3:]
    return True
1312 |
1486 |
def handle_oc(self):
    """
    Handle a file:num occurrence line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, minus its first three characters (the marker
    prefix), is split on whitespace and every reference found is appended
    to the current entry's ``occurrences`` as a ``(file, line)`` tuple of
    strings.

    A reference is split on its *last* colon only, so file names that
    contain colons themselves (e.g. ``C:\\src\\app.py:12``) are kept intact.
    When there is no colon, or what follows the last colon is not a number,
    the whole reference is used as the file name and the line is ``''``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # str.split() without argument never yields empty strings
    for occurrence in self.current_token[3:].split():
        fil, sep, line = occurrence.rpartition(':')
        if sep and line.isdigit():
            self.current_entry.occurrences.append((fil, line))
        else:
            self.current_entry.occurrences.append((occurrence, ''))
    return True
1330 |
1504 |
def handle_fl(self):
    """
    Handle a flags line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, minus its first three characters (the marker
    prefix), is split on commas and each piece, stripped of surrounding
    whitespace, is appended to the current entry's ``flags``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    # splitting on ',' and stripping accepts both "a, b" and "a,b"
    raw_flags = self.current_token[3:].split(',')
    self.current_entry.flags += [flag.strip() for flag in raw_flags]
    return True
1338 |
1513 |
def handle_pp(self):
    """
    Handle a previous msgid_plural line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, without its first and last characters (the
    enclosing quotes), is unescaped and stored as the current entry's
    ``previous_msgid_plural``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgid_plural = unescape(quoted[1:-1])
    return True
1347 |
1522 |
def handle_pm(self):
    """
    Handle a previous msgid line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, without its first and last characters (the
    enclosing quotes), is unescaped and stored as the current entry's
    ``previous_msgid``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgid = unescape(quoted[1:-1])
    return True
1356 |
1531 |
def handle_pc(self):
    """
    Handle a previous msgctxt line.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, without its first and last characters (the
    enclosing quotes), is unescaped and stored as the current entry's
    ``previous_msgctxt``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.previous_msgctxt = unescape(quoted[1:-1])
    return True
1365 |
1540 |
def handle_ct(self):
    """
    Handle a msgctxt.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current token, without its first and last characters (the
    enclosing quotes), is unescaped and stored as the current entry's
    ``msgctxt``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    quoted = self.current_token
    self.current_entry.msgctxt = unescape(quoted[1:-1])
    return True
1373 |
1548 |
def handle_mi(self):
    """
    Handle a msgid.

    When the parser state is ``mc``, ``ms`` or ``mx`` the entry being built
    is appended to the instance and a new entry is started at the current
    line. The current entry's ``obsolete`` attribute is set from the
    parser's ``entry_obsolete`` value, and the current token, without its
    first and last characters (the enclosing quotes), is unescaped and
    stored as the entry's ``msgid``.

    Always returns ``True``.
    """
    if self.current_state in ['mc', 'ms', 'mx']:
        self.instance.append(self.current_entry)
        self.current_entry = POEntry(linenum=self.current_line)
    entry = self.current_entry
    entry.obsolete = self.entry_obsolete
    entry.msgid = unescape(self.current_token[1:-1])
    return True
1382 |
1557 |
1383 def handle_mp(self): |
1558 def handle_mp(self): |
1390 self.current_entry.msgstr = unescape(self.current_token[1:-1]) |
1565 self.current_entry.msgstr = unescape(self.current_token[1:-1]) |
1391 return True |
1566 return True |
1392 |
1567 |
def handle_mx(self):
    """
    Handle a msgstr plural.

    The current token is expected to look like ``msgstr[N] "text"``. The
    plural index ``N`` is converted to an ``int``, the text between the
    first double quote and the last character of the token is unescaped and
    stored in the current entry's ``msgstr_plural`` under that index, and
    the index is remembered in ``self.msgstr_index``.

    Raises ``ValueError`` when the closing bracket is missing or the index
    is not a number.

    Always returns ``True``.
    """
    token = self.current_token
    # the index is everything between "msgstr[" (7 characters) and the
    # closing bracket, so indexes made of several digits work too
    index = int(token[7:token.index(']')])
    # the amount of whitespace between "]" and the opening quote may vary,
    # hence the search for the first quote instead of a fixed offset
    value = token[token.find('"') + 1:-1]
    self.current_entry.msgstr_plural[index] = unescape(value)
    self.msgstr_index = index
    return True
1399 |
1575 |
def handle_mc(self):
    """
    Handle a msgid or msgstr continuation line.

    The current token, without its first and last characters (the enclosing
    quotes), is unescaped and appended to the field of the current entry
    that matches the current parser state:

    ``ct`` -> ``msgctxt``, ``mi`` -> ``msgid``, ``mp`` -> ``msgid_plural``,
    ``ms`` -> ``msgstr``, ``mx`` -> ``msgstr_plural[self.msgstr_index]``,
    ``pp`` -> ``previous_msgid_plural``, ``pm`` -> ``previous_msgid``,
    ``pc`` -> ``previous_msgctxt``.

    In any other state the token is ignored.

    Always returns ``False`` so that the current state is not changed.
    """
    token = unescape(self.current_token[1:-1])
    entry = self.current_entry
    state = self.current_state
    if state == 'mx':
        # plural translations live in a dict keyed by plural index
        entry.msgstr_plural[self.msgstr_index] += token
    else:
        attribute = {
            'ct': 'msgctxt',
            'mi': 'msgid',
            'mp': 'msgid_plural',
            'ms': 'msgstr',
            'pp': 'previous_msgid_plural',
            'pm': 'previous_msgid',
            'pc': 'previous_msgctxt',
        }.get(state)
        if attribute is not None:
            setattr(entry, attribute, getattr(entry, attribute) + token)
    # don't change the current state
    return False
1432 |
|
1433 # }}} |
1597 # }}} |
1434 # class _MOFileParser {{{ |
1598 # class _MOFileParser {{{ |
|
1599 |
1435 |
1600 |
1436 class _MOFileParser(object): |
1601 class _MOFileParser(object): |
1437 """ |
1602 """ |
1438 A class to parse binary mo files. |
1603 A class to parse binary mo files. |
1439 """ |
1604 """ |
1454 ``check_for_duplicates`` |
1619 ``check_for_duplicates`` |
1455 whether to check for duplicate entries when adding entries to the |
1620 whether to check for duplicate entries when adding entries to the |
1456 file (optional, default: ``False``). |
1621 file (optional, default: ``False``). |
1457 """ |
1622 """ |
1458 self.fhandle = open(mofile, 'rb') |
1623 self.fhandle = open(mofile, 'rb') |
1459 self.instance = MOFile( |
1624 |
|
1625 klass = kwargs.get('klass') |
|
1626 if klass is None: |
|
1627 klass = MOFile |
|
1628 self.instance = klass( |
1460 fpath=mofile, |
1629 fpath=mofile, |
1461 encoding=kwargs.get('encoding', default_encoding), |
1630 encoding=kwargs.get('encoding', default_encoding), |
1462 check_for_duplicates=kwargs.get('check_for_duplicates', False) |
1631 check_for_duplicates=kwargs.get('check_for_duplicates', False) |
1463 ) |
1632 ) |
1464 |
1633 |
|
def __del__(self):
    """
    Make sure the file is closed, this prevents warnings on unclosed file
    when running tests with python >= 3.2.
    """
    # ``fhandle`` does not exist when the constructor failed while opening
    # the file; look it up defensively so that garbage collection of such
    # a half-built parser does not raise an AttributeError.
    fhandle = getattr(self, 'fhandle', None)
    if fhandle:
        fhandle.close()
|
1641 |
1465 def parse(self): |
1642 def parse(self): |
1466 """ |
1643 """ |
1467 Build the instance with the file handle provided in the |
1644 Build the instance with the file handle provided in the |
1468 constructor. |
1645 constructor. |
1469 """ |
1646 """ |
1470 # parse magic number |
1647 # parse magic number |
1471 magic_number = self._readbinary('<I', 4) |
1648 magic_number = self._readbinary('<I', 4) |
1472 if magic_number == MOFile.LITTLE_ENDIAN: |
1649 if magic_number == MOFile.MAGIC: |
1473 ii = '<II' |
1650 ii = '<II' |
1474 elif magic_number == MOFile.BIG_ENDIAN: |
1651 elif magic_number == MOFile.MAGIC_SWAPPED: |
1475 ii = '>II' |
1652 ii = '>II' |
1476 else: |
1653 else: |
1477 raise IOError('Invalid mo file, magic number is incorrect !') |
1654 raise IOError('Invalid mo file, magic number is incorrect !') |
1478 self.instance.magic_number = magic_number |
1655 self.instance.magic_number = magic_number |
1479 # parse the version number and the number of strings |
1656 # parse the version number and the number of strings |
1480 self.instance.version, numofstrings = self._readbinary(ii, 8) |
1657 version, numofstrings = self._readbinary(ii, 8) |
|
1658 # from MO file format specs: "A program seeing an unexpected major |
|
1659 # revision number should stop reading the MO file entirely" |
|
1660 if version not in (0, 1): |
|
1661 raise IOError('Invalid mo file, unexpected major revision number') |
|
1662 self.instance.version = version |
1481 # original strings and translation strings hash table offset |
1663 # original strings and translation strings hash table offset |
1482 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) |
1664 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) |
1483 # move to msgid hash table and read length and offset of msgids |
1665 # move to msgid hash table and read length and offset of msgids |
1484 self.fhandle.seek(msgids_hash_offset) |
1666 self.fhandle.seek(msgids_hash_offset) |
1485 msgids_index = [] |
1667 msgids_index = [] |
1489 self.fhandle.seek(msgstrs_hash_offset) |
1671 self.fhandle.seek(msgstrs_hash_offset) |
1490 msgstrs_index = [] |
1672 msgstrs_index = [] |
1491 for i in range(numofstrings): |
1673 for i in range(numofstrings): |
1492 msgstrs_index.append(self._readbinary(ii, 8)) |
1674 msgstrs_index.append(self._readbinary(ii, 8)) |
1493 # build entries |
1675 # build entries |
|
1676 encoding = self.instance.encoding |
1494 for i in range(numofstrings): |
1677 for i in range(numofstrings): |
1495 self.fhandle.seek(msgids_index[i][1]) |
1678 self.fhandle.seek(msgids_index[i][1]) |
1496 msgid = self.fhandle.read(msgids_index[i][0]) |
1679 msgid = self.fhandle.read(msgids_index[i][0]) |
|
1680 |
1497 self.fhandle.seek(msgstrs_index[i][1]) |
1681 self.fhandle.seek(msgstrs_index[i][1]) |
1498 msgstr = self.fhandle.read(msgstrs_index[i][0]) |
1682 msgstr = self.fhandle.read(msgstrs_index[i][0]) |
1499 if i == 0: # metadata |
1683 if i == 0 and not msgid: # metadata |
1500 raw_metadata, metadata = msgstr.split('\n'), {} |
1684 raw_metadata, metadata = msgstr.split(b('\n')), {} |
1501 for line in raw_metadata: |
1685 for line in raw_metadata: |
1502 tokens = line.split(':', 1) |
1686 tokens = line.split(b(':'), 1) |
1503 if tokens[0] != '': |
1687 if tokens[0] != b(''): |
1504 try: |
1688 try: |
1505 metadata[tokens[0]] = tokens[1].strip() |
1689 k = tokens[0].decode(encoding) |
|
1690 v = tokens[1].decode(encoding) |
|
1691 metadata[k] = v.strip() |
1506 except IndexError: |
1692 except IndexError: |
1507 metadata[tokens[0]] = '' |
1693 metadata[k] = u('') |
1508 self.instance.metadata = metadata |
1694 self.instance.metadata = metadata |
1509 continue |
1695 continue |
1510 # test if we have a plural entry |
1696 # test if we have a plural entry |
1511 msgid_tokens = msgid.split('\0') |
1697 msgid_tokens = msgid.split(b('\0')) |
1512 if len(msgid_tokens) > 1: |
1698 if len(msgid_tokens) > 1: |
1513 entry = self._build_entry( |
1699 entry = self._build_entry( |
1514 msgid=msgid_tokens[0], |
1700 msgid=msgid_tokens[0], |
1515 msgid_plural=msgid_tokens[1], |
1701 msgid_plural=msgid_tokens[1], |
1516 msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0'))) |
1702 msgstr_plural=dict((k, v) for k, v in |
|
1703 enumerate(msgstr.split(b('\0')))) |
1517 ) |
1704 ) |
1518 else: |
1705 else: |
1519 entry = self._build_entry(msgid=msgid, msgstr=msgstr) |
1706 entry = self._build_entry(msgid=msgid, msgstr=msgstr) |
1520 self.instance.append(entry) |
1707 self.instance.append(entry) |
1521 # close opened file |
1708 # close opened file |
1522 self.fhandle.close() |
1709 self.fhandle.close() |
1523 return self.instance |
1710 return self.instance |
1524 |
1711 |
def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                 msgstr_plural=None):
    """
    Build a :class:`MOEntry` from raw (byte string) values.

    Every value is decoded with the encoding of the instance being built.

    ``msgid``
        raw msgid; when it contains a ``\\x04`` byte, what precedes the
        first one is used as the entry's msgctxt and what follows as its
        msgid.

    ``msgstr``
        raw translation (optional, ignored when empty).

    ``msgid_plural``
        raw plural msgid (optional, ignored when empty).

    ``msgstr_plural``
        dict mapping plural indexes to raw translations (optional, ignored
        when empty). The dict is not modified.
    """
    encoding = self.instance.encoding
    # split once only: anything after the first separator belongs to the
    # msgid and must not be dropped
    msgctxt_msgid = msgid.split(b('\x04'), 1)
    if len(msgctxt_msgid) > 1:
        kwargs = {
            'msgctxt': msgctxt_msgid[0].decode(encoding),
            'msgid': msgctxt_msgid[1].decode(encoding),
        }
    else:
        kwargs = {'msgid': msgid.decode(encoding)}
    if msgstr:
        kwargs['msgstr'] = msgstr.decode(encoding)
    if msgid_plural:
        kwargs['msgid_plural'] = msgid_plural.decode(encoding)
    if msgstr_plural:
        # decode into a new dict rather than rewriting the caller's one
        kwargs['msgstr_plural'] = dict(
            (index, value.decode(encoding))
            for index, value in msgstr_plural.items()
        )
    return MOEntry(**kwargs)
1542 |
1732 |
1543 def _readbinary(self, fmt, numbytes): |
1733 def _readbinary(self, fmt, numbytes): |
1544 """ |
1734 """ |
1548 bytes = self.fhandle.read(numbytes) |
1738 bytes = self.fhandle.read(numbytes) |
1549 tup = struct.unpack(fmt, bytes) |
1739 tup = struct.unpack(fmt, bytes) |
1550 if len(tup) == 1: |
1740 if len(tup) == 1: |
1551 return tup[0] |
1741 return tup[0] |
1552 return tup |
1742 return tup |
1553 |
|
1554 # }}} |
1743 # }}} |
|
1744 # class TextWrapper {{{ |
|
1745 |
|
1746 |
|
class TextWrapper(textwrap.TextWrapper):
    """
    Subclass of textwrap.TextWrapper that backports the
    drop_whitespace option.
    """
    def __init__(self, *args, **kwargs):
        """
        Accept the same arguments as ``textwrap.TextWrapper`` plus the
        keyword argument ``drop_whitespace`` (default: ``True``).
        """
        # remove the option before calling the parent constructor, which
        # may not know about it, and set it on the instance afterwards
        drop_whitespace = kwargs.pop('drop_whitespace', True)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = drop_whitespace

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Wrap a sequence of text chunks and return a list of lines of
        length 'self.width' or less.  (If 'break_long_words' is false,
        some lines may be longer than this.)  Chunks correspond roughly
        to words and the whitespace between them: each chunk is
        indivisible (modulo 'break_long_words'), but a line break can
        come between any two chunks.  Chunks should not have internal
        whitespace; ie. a chunk is either all whitespace or a "word".
        Whitespace chunks will be removed from the beginning and end of
        lines (unless 'drop_whitespace' is false), but apart from that
        whitespace is preserved.

        Note that ``chunks`` is consumed (reversed, then emptied) in place.
        """
        lines = []
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        # Arrange in reverse order so items can be efficiently popped
        # from a stack of chunks.
        chunks.reverse()

        while chunks:

            # Start the list of chunks that will make up the current line.
            # cur_len is just the length of all the chunks in cur_line.
            cur_line = []
            cur_len = 0

            # Figure out which static string will prefix this line.
            if lines:
                indent = self.subsequent_indent
            else:
                indent = self.initial_indent

            # Maximum width for this line.
            width = self.width - len(indent)

            # First chunk on line is whitespace -- drop it, unless this
            # is the very beginning of the text (ie. no lines started yet).
            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
                del chunks[-1]

            while chunks:
                chunk_len = len(chunks[-1])

                # Can at least squeeze this chunk onto the current line.
                if cur_len + chunk_len <= width:
                    cur_line.append(chunks.pop())
                    cur_len += chunk_len

                # Nope, this line is full.
                else:
                    break

            # The current line is full, and the next chunk is too big to
            # fit on *any* line (not just this one).
            if chunks and len(chunks[-1]) > width:
                self._handle_long_word(chunks, cur_line, cur_len, width)

            # If the last chunk on this line is all whitespace, drop it.
            if self.drop_whitespace and cur_line and not cur_line[-1].strip():
                del cur_line[-1]

            # Convert current line back to a string and store it in list
            # of all lines (return value).
            if cur_line:
                lines.append(indent + ''.join(cur_line))

        return lines
|
1826 # }}} |
|
1827 # function wrap() {{{ |
|
1828 |
|
1829 |
|
def wrap(text, width=70, **kwargs):
    """
    Wrap a single paragraph of text, returning a list of wrapped lines.

    ``text``
        the text to wrap.

    ``width``
        maximum length of the wrapped lines (optional, default: ``70``).

    Any other keyword argument is passed to the underlying text wrapper.
    """
    if sys.version_info < (2, 6):
        # on these versions the TextWrapper subclass of this module is
        # used instead of the standard library function
        return TextWrapper(width=width, **kwargs).wrap(text)
    return textwrap.wrap(text, width=width, **kwargs)
|
1837 |
|
1838 # }}} |