Package translate :: Package lang :: Module common
[hide private]
[frames | no frames]

Source Code for Module translate.lang.common

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """This module contains all the common features for languages. 
 23   
 24  Supported features: 
 25  language code (km, af) 
 26  language name (Khmer, Afrikaans) 
 27  Plurals 
 28    Number of plurals (nplurals) 
 29    Plural equation 
 30  pofilter tests to ignore 
 31   
 32  Segmentation 
 33    characters 
 34    words 
 35    sentences 
 36   
 37  TODO: 
 38  Ideas for possible features: 
 39   
 40  Language-Team information 
 41   
 42  Segmentation 
 43    phrases 
 44   
 45  Punctuation 
 46    End of sentence 
 47    Start of sentence 
 48    Middle of sentence 
 49    Quotes 
 50      single 
 51      double 
 52   
 53  Valid characters 
 54  Accelerator characters 
 55  Special characters 
 56  Direction (rtl or ltr) 
 57  """ 
 58   
 59  from translate.lang import data 
 60  import re 
 61   
class Common(object):
    """This class is the common parent class for all language classes.

    It holds language metadata (code, name, plural information), several
    collections of punctuation characters, and class methods that segment
    text into characters, words and sentences.
    """

    code = ""
    """The ISO 639 language code, possibly with a country specifier or other
    modifier.

    Examples:
        km
        pt_BR
        sr_YU@Latn
    """

    fullname = ""
    """The full (English) name of this language.

    Dialect codes should have the form of
        Khmer
        Portuguese (Brazil)
        #TODO: sr_YU@Latn?
    """

    nplurals = 0
    """The number of plural forms of this language.

    0 is not a valid value - it must be overridden.
    Any positive integer is valid (it should probably be between 1 and 6)
    Also see data.py
    """

    pluralequation = "0"
    """The plural equation for selection of plural forms.

    This is used for PO files to fill into the header.
    See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}.
    Also see data.py
    """

    # NOTE: the attribute name is misspelled, but it is part of the public
    # interface and is therefore kept as is.
    listseperator = u", "
    """This string is used to separate lists of textual elements. Most
    languages probably can stick with the default comma, but Arabic and some
    Asian languages might want to override this."""

    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
    """These punctuation marks are common in English and most languages that
    use latin script."""

    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
    """These are different quotation marks used by various languages."""

    invertedpunc = u"¿¡"
    """Inverted punctuation sometimes used at the beginning of sentences in
    Spanish, Asturian, Galician, and Catalan."""

    rtlpunc = u"،؟؛÷"
    """These punctuation marks are used by Arabic and Persian, for example."""

    CJKpunc = u"。、,;!?「」『』【】"
    """These punctuation marks are used in certain circumstances with CJK
    languages."""

    indicpunc = u"।॥॰"
    """These punctuation marks are used by several Indic languages."""

    ethiopicpunc = u"።፤፣"
    """These punctuation marks are used by several Ethiopic languages."""

    miscpunc = u"…±°¹²³·©®×£¥€"
    """The middle dot (·) is used by Greek and Georgian."""

    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc,
                            CJKpunc, indicpunc, ethiopicpunc, miscpunc])
    """We include many types of punctuation here, simply since this is only
    meant to determine if something is punctuation. Hopefully we catch some
    languages which might not be represented with modules. Most languages won't
    need to override this."""

    sentenceend = u".!?…։؟।。!?።"
    """These marks can indicate a sentence end. Once again we try to account
    for many languages. Most languages won't need to override this."""

    #The following tries to account for a lot of things. For the best idea of
    #what works, see test_common.py. We try to ignore abbreviations, for
    #example, by checking that the following sentence doesn't start with lower
    #case or numbers.
    #Note that the pattern is built once, from this class's sentenceend, when
    #the class body is executed.
    sentencere = re.compile(r"""(?s)    #make . also match newlines
                            .*?         #anything, but match non-greedy
                            [%s]        #the puntuation for sentence ending
                            \s+         #the spacing after the puntuation
                            (?=[^a-z\d])#lookahead that next part starts with caps
                            """ % sentenceend, re.VERBOSE)

    puncdict = {}
    """A dictionary of punctuation transformation rules that can be used by
    punctranslate()."""

    ignoretests = []
    """List of pofilter tests for this language that must be ignored."""

    checker = None
    """A language specific checker (see filters.checks).

    This doesn't need to be supplied, but will be used if it exists."""

    def __init__(self, code):
        """This constructor is used if we need to instantiate an object (not
        the usual setup). This will mostly happen when the factory is asked
        for a language for which we don't have a dedicated class.

        @param code: The language code to look up. A false value (None or
        an empty string) is stored as "" and no lookup is done.
        """
        self.code = code or ""
        # Look the code up in data.languages; on a miss, ask
        # data.simplercode() for a simpler variant and retry until a match
        # is found or no code is left.
        while code:
            langdata = data.languages.get(code, None)
            if langdata:
                self.fullname, self.nplurals, self.pluralequation = langdata
                break
            code = data.simplercode(code)
        # If nothing was found the class level defaults stay in place. This
        # is deliberately silent.

    def __repr__(self):
        """Give a simple string representation without address information to
        be able to store it in text for comparison later.

        The class name in the result is always "Common"; the language code
        is appended in brackets if one is set.
        """
        detail = ""
        if self.code:
            detail = "(%s)" % self.code
        return "<class 'translate.lang.common.Common%s'>" % detail

    def punctranslate(cls, text):
        """Converts the punctuation in a string according to the rules of the
        language.

        @param text: The string to convert.
        @return: text with each key of puncdict replaced by its value.
        """
        # TODO: look at po::escapeforpo() for performance idea
        # items() rather than iteritems() so that this also runs on Python 3.
        for source, target in cls.puncdict.items():
            text = text.replace(source, target)
        # Let's account for cases where a punctuation symbol plus a space is
        # replaced, but the space won't exist at the end of a message
        if text and text[-1] + " " in cls.puncdict:
            text = text[:-1] + cls.puncdict[text[-1] + " "]
        return text
    punctranslate = classmethod(punctranslate)

    def character_iter(cls, text):
        """Returns an iterator over the characters in text.

        Punctuation is left out, and a run of whitespace characters only
        contributes its first character.
        """
        #We don't return more than one consecutive whitespace character
        prev = 'A'
        for c in text:
            if c.isspace() and prev.isspace():
                continue
            prev = c
            if c not in cls.punctuation:
                yield c
    character_iter = classmethod(character_iter)

    def characters(cls, text):
        """Returns a list of characters in text."""
        return list(cls.character_iter(text))
    characters = classmethod(characters)

    def word_iter(cls, text):
        """Returns an iterator over the words in text.

        The text is split on whitespace and punctuation is stripped from
        both ends of each piece; pieces that end up empty are skipped.
        """
        #TODO: Consider replacing punctuation with space before split()
        for w in text.split():
            word = w.strip(cls.punctuation)
            if word:
                yield word
    word_iter = classmethod(word_iter)

    def words(cls, text):
        """Returns a list of words in text."""
        return list(cls.word_iter(text))
    words = classmethod(words)

    def sentence_iter(cls, text, strip=True):
        """Returns an iterator over the sentences in text.

        @param text: The text to segment.
        @param strip: Whether surrounding whitespace is removed from each
        sentence. Sentences that are empty after stripping are skipped.
        """
        lastmatch = 0
        for match in cls.sentencere.finditer(text):
            lastmatch = match.end()
            sentence = match.group()
            if strip:
                sentence = sentence.strip()
            if sentence:
                yield sentence
        # Whatever follows the last match is the final sentence.
        remainder = text[lastmatch:]
        if strip:
            remainder = remainder.strip()
        if remainder:
            yield remainder
    sentence_iter = classmethod(sentence_iter)

    def sentences(cls, text, strip=True):
        """Returns a list of sentences in text."""
        return list(cls.sentence_iter(text, strip=strip))
    sentences = classmethod(sentences)

    def capsstart(cls, text):
        """Determines whether the text starts with a capital letter.

        Leading whitespace and punctuation are ignored.

        @return: True or False. Text without any remaining characters gives
        False.
        """
        stripped = text.lstrip().lstrip(cls.punctuation)
        # bool() so that empty input gives False and not an empty string.
        return bool(stripped and stripped[0].isupper())
    capsstart = classmethod(capsstart)
256