1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """Module for parsing Gettext .mo files for translation.
32
33 The coding of .mo files was produced from U{Gettext documentation
34 <http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>},
35 Python's msgfmt.py and by observing and testing existing .mo files in the wild.
36
37 The hash algorithm is implemented for MO files; this should result in
38 faster access of the MO file. The hash is optional for Gettext
39 and is not needed for reading or writing MO files. In this implementation
40 it is always on, and it sometimes produces different results to Gettext
41 in very small files.
42 """
43
44 import struct
45 import array
46 import re
47
48 from translate.storage import base
49 from translate.storage import po
50 from translate.storage import poheader
51 from translate.misc.multistring import multistring
52
# Magic number found in the first four bytes of an MO file.  It is compared
# against both the little- and big-endian reading of those bytes to detect
# the byte order of the file.  (No trailing "L": plain integer literals are
# promoted automatically, and the suffix is a syntax error on Python 3.)
MO_MAGIC_NUMBER = 0x950412de
54
55
def mounpack(filename='messages.mo'):
    """Helper to unpack Gettext MO files into a Python string.

    Reads the whole file and prints its content as a single line of
    C{\\xNN} escapes, one escape per byte.

    @param filename: path of the MO file to dump
    """
    # MO files are binary data, so read raw bytes; the "with" block closes
    # the file even when reading fails.
    with open(filename, 'rb') as mo_file:
        # A bytearray yields integer byte values on every Python version.
        data = bytearray(mo_file.read())
    print("\\x%02x" * len(data) % tuple(data))
62
63
def my_swap4(result):
    """Reverse the byte order of the low 32 bits of C{result}.

    Any bits above the low 32 are discarded.

    @param result: integer whose four low bytes are swapped
    @return: the byte-swapped value
    """
    swapped = 0
    for shift in (0, 8, 16, 24):
        byte = (result >> shift) & 0xff
        # The byte found at bit offset "shift" ends up at "24 - shift".
        swapped |= byte << (24 - shift)
    return swapped
71
72
def hashpjw(str_param):
    """Compute the hash value used to place a key in the MO hash table.

    Every character shifts the running value left by four bits and is
    added to it; whenever one of the top four bits of the 32-bit word gets
    set, those bits are folded back into the low byte and then cleared.

    @param str_param: the key to hash; either a string or a byte sequence
        (e.g. a bytearray) whose items are already integer byte values
    @return: the hash value as a non-negative integer (0 for an empty key)
    """
    HASHWORDBITS = 32
    top_nibble = 0xf << (HASHWORDBITS - 4)
    hval = 0
    for char in str_param:
        hval <<= 4
        # Iterating over bytes/bytearray may already produce integers.
        hval += char if isinstance(char, int) else ord(char)
        g = hval & top_nibble
        if g != 0:
            hval ^= g >> (HASHWORDBITS - 8)
            hval ^= g
    return hval
86
87
def get_next_prime_number(start):
    """Find the smallest prime number that is greater than or equal to
    C{start}.

    @param start: integer at which the search begins
    @return: the first prime number found at or after C{start}
    """

    def is_prime(num):
        """Trial division by 2 and by the odd numbers up to sqrt(num)."""
        if num < 2:
            return False
        if num < 4:
            # 2 and 3
            return True
        if num % 2 == 0:
            return False
        divider = 3
        # A composite number has a divisor no larger than its square
        # root, so there is no need to test beyond that.
        while divider * divider <= num:
            if num % divider == 0:
                return False
            divider += 2
        return True

    candidate = start
    while not is_prime(candidate):
        candidate += 1
    return candidate
107
108
109 -class mounit(base.TranslationUnit):
110 """A class representing a .mo translation message."""
111
112 - def __init__(self, source=None, encoding=None):
117
def getcontext(self):
    """Return the message context as one string.

    The pieces stored in C{self.msgctxt} are concatenated.

    @return: the joined context, or C{None} when C{self.msgctxt} is
        C{None}
    """
    context_parts = self.msgctxt
    return None if context_parts is None else "".join(context_parts)
124
def isheader(self):
    """Tell whether this unit is a header entry, i.e. whether its source
    is the empty string.

    @return: C{True} for a header entry, C{False} otherwise
    """
    source_is_empty = self.source == u""
    return source_is_empty
128
def istranslatable(self):
    """Tell whether this message can be translated, which is the case
    when its source is not empty.

    @return: C{True} when C{self.source} is truthy, C{False} otherwise
    """
    if self.source:
        return True
    return False
132
133
134 -class mofile(poheader.poheader, base.TranslationStore):
135 """A class representing a .mo file."""
136 UnitClass = mounit
137 Name = _("Gettext MO file")
138 Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
139 Extensions = ["mo", "gmo"]
140 _binary = True
141
def __init__(self, inputfile=None, unitclass=mounit):
    """Set up an empty MO store and optionally load C{inputfile} into it.

    @param inputfile: when not C{None}, handed to C{self.parsestring}
    @param unitclass: class stored as C{self.UnitClass} and passed on to
        the C{base.TranslationStore} initialiser
    """
    self.UnitClass = unitclass
    base.TranslationStore.__init__(self, unitclass=unitclass)
    self.filename = ''
    # Encoding assumed until parsed content specifies another one.
    self._encoding = "UTF-8"
    if inputfile is None:
        return
    self.parsestring(inputfile)
149
def __str__(self):
    """Output a string representation of the MO data file.

    Only units with a non-empty target are written.  The output consists
    of a seven-integer header, the key offset table, the value offset
    table, the hash table, the NUL-terminated keys and finally the
    NUL-terminated values.  All integers are packed in native byte order.

    @return: the serialised MO data
    """

    def add_to_hash_table(string, i):
        """Store key number C{i} (as C{i + 1}; 0 marks a free slot) in the
        first free slot of the probe sequence derived from its hash."""
        hash_value = hashpjw(string)
        size = 3 if hash_size <= 2 else hash_size
        hash_cursor = hash_value % size
        orig_hash_cursor = hash_cursor
        increment = 1 + (hash_value % (size - 2))
        while True:
            if hash_table[hash_cursor] == 0:
                hash_table[hash_cursor] = i + 1
                break
            hash_cursor = (hash_cursor + increment) % size
            # The probe must never wrap around to where it started
            # without having found a free slot.
            assert hash_cursor != orig_hash_cursor

    # The table size is the first prime at or above 4/3 of the number of
    # units, and never smaller than 3.
    hash_size = get_next_prime_number(len(self.units) * 4 // 3)
    if hash_size <= 2:
        hash_size = 3

    # Map every encoded key (context + EOT + comments + msgid, plural
    # forms separated by NUL) to its target.
    messages = {}
    for unit in self.units:
        if isinstance(unit.source, multistring):
            source = "".join(unit.msgidcomments) + \
                     "\0".join(unit.source.strings)
        else:
            source = "".join(unit.msgidcomments) + unit.source
        if unit.msgctxt:
            source = "".join(unit.msgctxt) + "\x04" + source
        if isinstance(unit.target, multistring):
            target = "\0".join(unit.target.strings)
        else:
            target = unit.target
        if unit.target:
            messages[source.encode("utf-8")] = target

    hash_table = array.array("I", [0] * hash_size)
    keys = sorted(messages)

    # Collect the pieces in lists and join them once, instead of growing
    # two strings with "+" on every iteration.
    offsets = []
    id_chunks = []
    str_chunks = []
    ids_length = 0
    strs_length = 0
    for i, key in enumerate(keys):
        add_to_hash_table(key, i)
        string = messages[key]
        if isinstance(string, unicode):
            string = string.encode('utf-8')
        offsets.append((ids_length, len(key), strs_length, len(string)))
        id_chunks.append(key + '\0')
        str_chunks.append(string + '\0')
        # Every entry is followed by one terminating NUL byte.
        ids_length += len(key) + 1
        strs_length += len(string) + 1
    ids = ''.join(id_chunks)
    strs = ''.join(str_chunks)

    # The keys start right after the header (7 integers), both offset
    # tables (16 bytes per key) and the hash table (4 bytes per slot).
    keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
    # The values follow directly after the keys.
    valuestart = keystart + ids_length
    koffsets = []
    voffsets = []
    # Each table entry is a (length, absolute file offset) pair.
    for o1, l1, o2, l2 in offsets:
        koffsets.extend((l1, o1 + keystart))
        voffsets.extend((l2, o2 + valuestart))
    offsets = koffsets + voffsets

    output = struct.pack("Iiiiiii",
                         MO_MAGIC_NUMBER,                # magic number
                         0,                              # version
                         len(keys),                      # number of entries
                         7 * 4,                          # start of key table
                         7 * 4 + len(keys) * 8,          # start of value table
                         hash_size,                      # size of hash table
                         7 * 4 + 2 * (len(keys) * 8))    # start of hash table
    if len(keys) > 0:
        output = output + array.array("i", offsets).tostring()
    output = output + hash_table.tostring()
    output = output + ids
    output = output + strs
    return output
237
def parse(self, input):
    """Parse the given file or file source string.

    Every entry of the MO data becomes a unit that is added to this store
    with C{self.addunit}.  When the header entry (the one with an empty
    source) names a charset, C{self._encoding} is updated from it.

    @param input: the MO data itself, or an object with a C{read} method
        that returns it (such an object is closed after reading); a
        C{name} attribute, when present, is stored as C{self.filename}
    @raise ValueError: if the data does not start with the MO magic number
        in either byte order, or if its version field is greater than 1
    """
    if hasattr(input, 'name'):
        self.filename = input.name
    elif not getattr(self, 'filename', ''):
        self.filename = ''
    if hasattr(input, "read"):
        mosrc = input.read()
        input.close()
        input = mosrc

    # The magic number tells which byte order the file was written in.
    little, = struct.unpack("<L", input[:4])
    big, = struct.unpack(">L", input[:4])
    if little == MO_MAGIC_NUMBER:
        endian = "<"
    elif big == MO_MAGIC_NUMBER:
        endian = ">"
    else:
        raise ValueError("This is not an MO file")

    # Header fields: magic, version, number of entries, key table offset,
    # value table offset, hash table size, hash table offset.  The magic
    # number is already checked and the hash table is not used for reading.
    header = struct.unpack("%sLiiiiii" % endian, input[:(7 * 4)])
    version, lenkeys, startkey, startvalue = header[1:5]
    if version > 1:
        raise ValueError("Unable to process MO files with versions > 1. "
                         "This is a %d version MO file" % version)

    # Every table entry is a (length, offset) pair of two 4-byte integers.
    entry_format = "%sii" % endian
    for i in range(lenkeys):
        nextkey = startkey + (i * 2 * 4)
        nextvalue = startvalue + (i * 2 * 4)
        klength, koffset = struct.unpack(entry_format,
                                         input[nextkey:nextkey + (2 * 4)])
        vlength, voffset = struct.unpack(entry_format,
                                         input[nextvalue:nextvalue + (2 * 4)])
        source = input[koffset:koffset + klength]
        rawtarget = input[voffset:voffset + vlength]
        context = None
        if "\x04" in source:
            # Split on the first EOT only, so that a further EOT byte in
            # the message id cannot break the two-value unpacking.
            context, source = source.split("\x04", 1)
        # Plural forms are separated by NUL bytes.
        source = multistring(source.split("\0"), encoding=self._encoding)
        if source == "":
            # Header entry: pick up the charset for decoding the strings.
            charset = re.search("charset=([^\\s]+)", rawtarget)
            if charset:
                self._encoding = po.encodingToUse(charset.group(1))
        target = multistring(rawtarget.split("\0"),
                             encoding=self._encoding)
        newunit = mounit(source)
        newunit.settarget(target)
        if context is not None:
            newunit.msgctxt.append(context)
        self.addunit(newunit)
287