1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """Class to perform translation memory matching from a store of translation units"""
23
24 from translate.search import lshtein
25 from translate.search import terminology
26 from translate.storage import base
27 from translate.storage import po
28 from translate.misc.multistring import multistring
29 import heapq
30
def sourcelen(unit):
    """Returns the length of the source string of the given unit.

    @param unit: Any object whose C{source} attribute supports len().
    @return: len(unit.source)
    """
    return len(unit.source)
34
def sourcelencmp(x, y):
    """Compares two units using sourcelen.

    @param x: First unit.
    @param y: Second unit.
    @return: 1 if x has the longer source, -1 if y has, 0 if they are equally
    long.
    """
    xlen = sourcelen(x)
    ylen = sourcelen(y)
    # Same value the cmp() builtin gives for two ints, without relying on
    # that builtin being available.
    return (xlen > ylen) - (xlen < ylen)
41
43 """A class that will do matching and store configuration for the matching process"""
def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
    """Sets up the matcher and builds the translation memory from store.

    @param store: Passed on to inittm() to fill the memory.
    @param max_candidates: The maximum number of candidates that should be
    assembled.
    @param min_similarity: The minimum similarity that must be attained to
    be included in the result.
    @param max_length: Handed to the default comparer and stored by
    setparameters().
    @param comparer: Optional Comparer with a similarity() function. When
    None, a lshtein.LevenshteinComparer(max_length) is created.
    @param usefuzzy: Stored as self.usefuzzy before the memory is built
    (presumably consulted when units are filtered; that code is not shown
    here).
    """
    self.comparer = lshtein.LevenshteinComparer(max_length) if comparer is None else comparer
    self.setparameters(max_candidates, min_similarity, max_length)
    # usefuzzy has to be in place before inittm() starts adding units.
    self.usefuzzy = usefuzzy
    self.inittm(store)
    self.addpercentage = True
55
68
def inittm(self, stores):
    """Initialises the memory for later use. We use simple base units for
    speedup.

    @param stores: A single store or a list of stores. The units of each
    one are added through extendtm().
    """
    self.existingunits = {}
    self.candidates = base.TranslationStore()

    if not isinstance(stores, list):
        stores = [stores]
    for store in stores:
        # Sorting is suppressed per store; one sort after the loop is enough.
        self.extendtm(store.units, store=store, sort=False)
    # Key-based sort: same ordering as comparing with sourcelencmp, but the
    # length of each unit is computed only once.
    self.candidates.units.sort(key=sourcelen)
80
81
82
def extendtm(self, units, store=None, sort=True):
    """Extends the memory with extra unit(s).

    Each unit accepted by self.usable is copied into a simple
    base.TranslationUnit carrying source, target, translator notes and the
    fuzzy state, and appended to self.candidates.units.

    @param units: The units to add to the TM. Anything that is not a list
    is treated as one single unit.
    @param store: Optional store from where some metadata can be retrieved
    and associated with each unit.
    @param sort: Optional parameter that can be set to False to suppress
    sorting of the candidates list. This should probably only be used in
    inittm().
    """
    if not isinstance(units, list):
        units = [units]
    for candidate in filter(self.usable, units):
        simpleunit = base.TranslationUnit("")

        source = candidate.source
        target = candidate.target
        if isinstance(source, multistring):
            if len(source.strings) > 1:
                # Keep the plural forms so buildunits() can hand them back.
                simpleunit.orig_source = source
                simpleunit.orig_target = target
            source = unicode(source)
            target = unicode(target)
        simpleunit.source = source
        simpleunit.target = target

        simpleunit.addnote(candidate.getnotes(origin="translator"))
        simpleunit.fuzzy = candidate.isfuzzy()
        if store:
            simpleunit.filepath = store.filepath
            simpleunit.translator = store.translator
            simpleunit.date = store.date
        self.candidates.units.append(simpleunit)
    if sort:
        # Key-based sort: same ordering as comparing with sourcelencmp, but
        # the length of each unit is computed only once.
        self.candidates.units.sort(key=sourcelen)
122
def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
    """Stores the matching parameters on the instance; the tm itself is not
    rebuilt. A parameter that is left out is reset to its default value
    rather than keeping its previous value.

    @param max_candidates: Stored as self.MAX_CANDIDATES.
    @param min_similarity: Stored as self.MIN_SIMILARITY.
    @param max_length: Stored as self.MAX_LENGTH.
    """
    self.MAX_CANDIDATES, self.MIN_SIMILARITY, self.MAX_LENGTH = (
        max_candidates, min_similarity, max_length)
129
def getstoplength(self, min_similarity, text):
    """Calculates a length beyond which we are not interested.
    The extra fat is because we don't use plain character distance only.

    @param min_similarity: Required similarity as a percentage.
    @param text: The text being matched.
    @return: len(text) scaled up by 100/min_similarity, capped at
    self.MAX_LENGTH. For a min_similarity of zero or less the cap itself is
    returned.
    """
    if min_similarity <= 0:
        # No usable lower bound on similarity: only the global cap applies.
        # This also avoids dividing by zero.
        return self.MAX_LENGTH
    return min(len(text) / (min_similarity / 100.0), self.MAX_LENGTH)
134
def getstartlength(self, min_similarity, text):
    """Calculates the minimum length we are interested in.
    The extra fat is because we don't use plain character distance only.

    @param min_similarity: Required similarity as a percentage.
    @param text: The text being matched.
    @return: len(text) scaled down by min_similarity/100, but never less
    than 1.
    """
    return max(len(text) * (min_similarity / 100.0), 1)
139
def matches(self, text):
    """Returns a list of possible matches for given source text.

    The search walks self.candidates.units and stops at the first unit
    whose source is longer than the current stop length, so it relies on
    that list being ordered by source length.

    @type text: String
    @param text: The text that will be search for in the translation memory
    @rtype: list
    @return: a list of units with the source and target strings from the
    translation memory, best match first. If self.addpercentage is true
    (default) the match quality is given as a percentage in the notes.
    """
    if self.MAX_CANDIDATES < 1:
        # No room for any result; the heap below would be empty.
        return self.buildunits([])

    # Fixed-size min-heap of the best matches so far. Entries are
    # (score, tiebreak, unit); the tiebreak is unique per real entry so two
    # equal scores never fall through to comparing the unit objects.
    bestcandidates = [(0.0, 0, None)] * self.MAX_CANDIDATES
    heapq.heapify(bestcandidates)

    # Raised during the search once the heap holds better scores.
    min_similarity = self.MIN_SIMILARITY
    units = self.candidates.units

    # Skip units that are too short. If no unit is long enough the search
    # range is empty.
    startlength = self.getstartlength(min_similarity, text)
    startindex = len(units)
    for index, candidate in enumerate(units):
        if len(candidate.source) >= startlength:
            startindex = index
            break

    # Maximum source length to be considered.
    stoplength = self.getstoplength(min_similarity, text)

    for offset, candidate in enumerate(units[startindex:]):
        cmpstring = candidate.source
        if len(cmpstring) > stoplength:
            break
        similarity = self.comparer.similarity(text, cmpstring, min_similarity)
        if similarity < min_similarity:
            continue
        if similarity > bestcandidates[0][0]:
            # Negated offset: among equal scores the unit found first is
            # evicted last and is listed first in the result.
            heapq.heapreplace(bestcandidates, (similarity, -offset, candidate))
            if min_similarity < bestcandidates[0][0]:
                # Every slot now beats the old threshold, so tighten it and
                # shrink the length window accordingly.
                min_similarity = bestcandidates[0][0]
                stoplength = self.getstoplength(min_similarity, text)

    # Drop the placeholder entries that were never replaced.
    found = [entry for entry in bestcandidates if entry[0] != 0]
    found.sort(reverse=True)
    return self.buildunits([(score, candidate) for score, _, candidate in found])
194
def buildunits(self, candidates):
    """Builds a list of units conforming to base API, with the score in the
    comment.

    @param candidates: Sequence of (score, unit) pairs.
    @return: A list of new po.pounit objects, one per pair, in the same
    order. If self.addpercentage is true the score is added as a note of
    the form "NN%".
    """
    units = []
    for score, candidate in candidates:
        # Units built from plural sources carry the original strings; use
        # those for the result, but leave the stored unit itself untouched
        # so later searches still see the same source.
        if hasattr(candidate, "orig_source"):
            source = candidate.orig_source
            target = candidate.orig_target
        else:
            source = candidate.source
            target = candidate.target
        newunit = po.pounit(source)
        newunit.target = target
        newunit.markfuzzy(candidate.fuzzy)
        # extendtm() only sets this metadata when it was given a store, so
        # copy whichever of these attributes the unit actually has.
        for attr in ("filepath", "translator", "date"):
            if hasattr(candidate, attr):
                setattr(newunit, attr, getattr(candidate, attr))
        candidatenotes = candidate.getnotes().strip()
        if candidatenotes:
            newunit.addnote(candidatenotes)
        if self.addpercentage:
            newunit.addnote("%d%%" % score)
        units.append(newunit)
    return units
215
217 """A matcher with settings specifically for terminology matching"""
def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
    """Sets up a matcher for terminology: the default comparer is a
    terminology.TerminologyComparer(max_length) and no percentage note is
    added to the results (self.addpercentage is False).

    @param store: Passed on to matcher.__init__.
    @param max_candidates: Passed on to matcher.__init__.
    @param min_similarity: Accepted, but not forwarded (see note below).
    @param max_length: Given to the default comparer and to
    matcher.__init__.
    @param comparer: Optional comparer to use instead of the default.
    """
    termcomparer = terminology.TerminologyComparer(max_length) if comparer is None else comparer
    # NOTE(review): the min_similarity argument is ignored; a fixed value of
    # 10 is always passed on. Looks deliberate - confirm before changing.
    matcher.__init__(self, store, max_candidates=max_candidates, min_similarity=10,
                     max_length=max_length, comparer=termcomparer)
    self.addpercentage = False
223
def inittm(self, store):
    """Normal initialisation, but convert all source strings to lower case.

    @param store: Passed unchanged to matcher.inittm().
    """
    matcher.inittm(self, store)
    for unit in self.candidates.units:
        unit.source = unit.source.lower()
229
234
239
def matches(self, text):
    """Normal matching after converting text to lower case.

    NOTE(review): an earlier description also mentioned replacing the
    results with the original units to retain comments, etc.; no such step
    is performed here - the result of matcher.matches() is returned as is.

    @param text: The text to search for.
    @return: Whatever matcher.matches() returns for the lower-cased text.
    """
    return matcher.matches(self, text.lower())
246