Package translate :: Package lang :: Module common
[hide private]
[frames] | no frames]

Source Code for Module translate.lang.common

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2007-2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module contains all the common features for languages. 
 23   
 24     Supported features 
 25     ================== 
 26       - language code (km, af) 
 27       - language name (Khmer, Afrikaans) 
 28       - Plurals 
 29         - Number of plurals (nplurals) 
 30         - Plural equation 
 31       - pofilter tests to ignore 
 32      
 33     Segmentation 
 34     ------------ 
 35       - characters 
 36       - words 
 37       - sentences 
 38      
 39     TODOs and Ideas for possible features 
 40     ===================================== 
 41       - Language-Team information 
 42       - Segmentation 
 43         - phrases 
 44      
 45     Punctuation 
 46     ----------- 
 47       - End of sentence 
 48       - Start of sentence 
 49       - Middle of sentence 
 50       - Quotes 
 51         - single 
 52         - double 
 53      
 54       - Valid characters 
 55       - Accelerator characters 
 56       - Special characters 
 57       - Direction (rtl or ltr) 
 58  """ 
 59   
 60  from translate.lang import data 
 61  import re 
 62   
class Common(object):
    """This class is the common parent class for all language classes.

    It holds language metadata (code, name, plural information), the
    punctuation tables used for segmentation, and class methods for
    splitting text into characters, words and sentences.  Language specific
    subclasses override the class attributes they need to change.
    """

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples::
        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of
      - Khmer
      - Portuguese (Brazil)
      - TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)
    @see: L{data}
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.
    @see: U{Gettext manual<http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>}
    @see: L{data}
    """
    # Don't change these defaults of nplurals or pluralequation willy-nilly:
    # some code probably depends on these for unrecognised languages

    listseperator = u", "
    """This string is used to separate lists of textual elements.  Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
                            indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation.  Hopefully we catch some
    languages which might not be represented with modules.  Most languages
    won't need to override this."""

    sentenceend = u".!?…։؟।。!?።"
    """These marks can indicate a sentence end.  Once again we try to account
    for many languages.  Most languages won't need to override this."""

    # The following tries to account for a lot of things.  For the best idea
    # of what works, see test_common.py.  We try to ignore abbreviations, for
    # example, by checking that the following sentence doesn't start with
    # lower case or numbers.
    sentencere = re.compile(r"""(?s)    #make . also match newlines
                            .*?         #anything, but match non-greedy
                            [%s]        #the puntuation for sentence ending
                            \s+         #the spacing after the puntuation
                            (?=[^a-z\d])#lookahead that next part starts with caps
                            """ % sentenceend, re.VERBOSE)

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    # Registry of already constructed language objects, keyed by language
    # code.  __new__() consults and fills it so that each code maps to a
    # single object.
    _languages = {}

    validaccel = None
    """Characters that can be used as accelerators (access keys) i.e. Alt+X
    where X is the accelerator.  These can include combining diacritics as
    long as they are accessible from the users keyboard in a single keystroke,
    but normally they would be at least precomposed characters.  All
    characters, lower and upper, are included in the list."""

    validdoublewords = []
    """Some languages allow double words in certain cases.  This is a list
    of such words."""

    def __new__(cls, code):
        """This returns the language object for the given code, following a
        singleton like approach (only one object per language).

        @param code: The language code to look up; a false value is treated
        as the empty string.
        @return: The cached object for this code if one exists, otherwise a
        newly created (and cached) one.
        """
        code = code or ""
        # First see if a language object for this code already exists
        if code in cls._languages:
            return cls._languages[code]
        # No existing language. Let's build a new one and keep a copy
        language = cls._languages[code] = object.__new__(cls)

        language.code = code
        # Try the full code first, then progressively simpler codes, until
        # the data module knows about one of them.  If none is known the
        # class level defaults simply stay in effect.
        while code:
            langdata = data.languages.get(code, None)
            if langdata:
                language.fullname, language.nplurals, language.pluralequation = langdata
                break
            code = data.simplercode(code)
        return language

    def __deepcopy__(self, memo=None):
        """Return the object itself instead of a copy, so that the one
        object per language code created by __new__() stays unique.

        @param memo: The optional memo dictionary passed along by
        copy.deepcopy(); when given, this object is registered in it.
        """
        # A None default avoids sharing one mutable dictionary between calls.
        if memo is not None:
            memo[id(self)] = self
        return self

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later."""
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    @classmethod
    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        @param text: The string to convert.  A false value (empty string or
        None) is returned unchanged.
        @return: The text with every key of L{puncdict} replaced by its
        value.
        """
        # TODO: look at po::escapeforpo() for performance idea
        if not text:
            return text
        # A trailing ellipsis is taken off before the general replacements
        # and put back (translated if there is a rule for it) afterwards, so
        # that rules for a single full stop can't break it up.
        ellipses_end = text.endswith(u"...")
        if ellipses_end:
            text = text[:-3]
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        if ellipses_end:
            text += cls.puncdict.get(u"...", u"...")
        if not text:
            # The rules removed everything; there is no last character left
            # to inspect below.
            return text
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of a message.
        # As a simple improvement for messages ending in ellipses (...), we
        # test that the last character is different from the second last
        # This is only relevant if the string has two characters or more
        if (text[-1] + u" " in cls.puncdict) and (len(text) < 2 or text[-2] != text[-1]):
            text = text[:-1] + cls.puncdict[text[-1] + u" "].rstrip()
        return text

    @classmethod
    def length_difference(cls, length):
        """Returns an estimate to a likely change in length relative to an
        English string of length length.

        @param length: The length of the English string.
        @return: The estimated number of extra characters as an integer.
        """
        # This is just a rudimentary heuristic guessing that most translations
        # will be somewhat longer than the source language
        expansion_factor = 0
        code = cls.code
        while code:
            # Look up the progressively simplified code (not cls.code), so
            # that e.g. a country variant falls back to its base language.
            expansion_factor = data.expansion_factors.get(code, 0)
            if expansion_factor:
                break
            code = data.simplercode(code)
        else:
            # No factor found for the code or any of its simpler forms.
            expansion_factor = 0.1  # default
        constant = max(5, int(40 * expansion_factor))
        # The default: return 5 + length/10
        return constant + int(expansion_factor * length)

    @classmethod
    def alter_length(cls, text):
        """Converts the given string by adding or removing characters as an
        estimation of translation length (with English assumed as source
        language).

        Each part of the text separated by a blank line is altered on its
        own; parts of nine characters or fewer are left as they are.
        """
        def alter_it(text):
            length = len(text)
            if length > 9:
                extra = cls.length_difference(length)
                if extra > 0:
                    # Lengthen by repeating the start of the text (without
                    # newlines) in front of it.
                    text = text[:extra].replace(u'\n', u'') + text
                else:
                    text = text[-extra:]
            return text
        return u"\n\n".join([alter_it(subtext) for subtext in text.split(u"\n\n")])

    @classmethod
    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Characters in L{punctuation} are skipped, and of several consecutive
        whitespace characters only the first one is returned.
        """
        # We don't return more than one consecutive whitespace character
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c

    @classmethod
    def characters(cls, text):
        """Returns a list of characters in text."""
        return list(cls.character_iter(text))

    @classmethod
    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        The text is split on whitespace and punctuation is stripped from
        both ends of each piece; pieces with nothing left are skipped.
        """
        # TODO: Consider replacing punctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word

    @classmethod
    def words(cls, text):
        """Returns a list of words in text."""
        return list(cls.word_iter(text))

    @classmethod
    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        @param text: The text to segment; a false value is treated as the
        empty string.
        @param strip: Whether surrounding whitespace should be removed from
        each sentence.
        """
        lastmatch = 0
        text = text or ""
        for item in cls.sentencere.finditer(text):
            lastmatch = item.end()
            sentence = item.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # Whatever follows the last recognised sentence end is the final
        # sentence.
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder

    @classmethod
    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text."""
        return list(cls.sentence_iter(text, strip=strip))

    @classmethod
    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and then leading punctuation are ignored.

        @return: True or False; False if nothing is left after stripping.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        return bool(stripped) and stripped[0].isupper()