Package translate :: Package storage :: Module html
[hide private]
[frames | no frames]

Source Code for Module translate.storage.html

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2004-2006,2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21  # 
 22   
 23  """module for parsing html files for translation""" 
 24   
 25  import re 
 26  from translate.storage import base 
 27  from HTMLParser import HTMLParser 
 28   
class htmlunit(base.TranslationUnit):
    """A unit of translatable/localisable HTML content.

    The text is kept in ``self.text`` with ``&`` and ``<`` escaped as
    ``&amp;`` and ``&lt;``; the ``source`` property converts to and from
    that stored form.
    """

    def __init__(self, source=None):
        """Create a unit with no locations and the given source text.

        ``source`` may be None, in which case the unit holds no text yet.
        """
        self.locations = []
        self.setsource(source)

    def getsource(self):
        """Return the unescaped source with line breaks turned into spaces.

        Returns None when no source has been set.
        """
        #TODO: Rethink how clever we should try to be with html entities.
        if self.text is None:
            return None
        # Undo the escaping in the reverse order of setsource(): "&lt;" must
        # be handled before "&amp;", otherwise a stored "&amp;lt;" (a literal
        # "&lt;" in the source) would first become "&lt;" and then "<".
        text = self.text.replace("&lt;", "<").replace("&amp;", "&")
        return text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")

    def setsource(self, source):
        """Store ``source`` with ``&`` and ``<`` escaped; None is kept as None."""
        self._rich_source = None
        if source is None:
            # The constructor's default is None; previously this raised
            # AttributeError on source.replace().
            self.text = None
        else:
            self.text = source.replace("&", "&amp;").replace("<", "&lt;")
    source = property(getsource, setsource)

    def addlocation(self, location):
        """Append ``location`` to this unit's list of locations."""
        self.locations.append(location)

    def getlocations(self):
        """Return the list of locations recorded with addlocation()."""
        return self.locations

class htmlfile(HTMLParser, base.TranslationStore):
    """Translation store that extracts translatable text blocks from HTML.

    The HTML is fed through ``HTMLParser``.  The text found inside a tag
    listed in ``markingtags`` becomes one unit, and the values of attributes
    listed in ``includeattrs`` become units of their own.  PHP sections are
    swapped for md5 placeholders while parsing (see ``phprep``) and restored
    just before a unit is stored (see ``addhtmlblock``).
    """
    UnitClass = htmlunit
    # Tags whose start tag flushes the pending block and opens a new one.
    markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
    # Attributes whose presence makes any tag open a new block.
    markingattrs = []
    # Attributes whose values are extracted as separate units.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]

    # A PHP section; group 1 is the code between "<?" and the first "?>".
    _php_re = re.compile(r'(?s)<\?(.*?)\?>')
    # Text that is exactly one PHP section (no "?>" before the final one).
    _only_php_re = re.compile(r'(?s)<\?(?:(?!\?>).)*\?>\Z')
    # charset=... inside a <meta> tag; [^>] keeps the match inside that tag.
    _charset_re = re.compile(r'''(?i)<meta\b[^>]*?\bcharset\s*=\s*["']?\s*([^\s"'>;/]+)''')
    # An opening tag; PHP sections inside it may contain ">".
    _open_tag_re = re.compile(r'(?s)<(?P<name>[A-Za-z][^\s/>]*)(?:<\?.*?\?>|[^>])*>')
    # A closing tag that ends the text.
    _close_tag_re = re.compile(r'</(?P<name>[A-Za-z][^\s>]*)\s*>\Z')
    # Any opening or closing tag.
    _tag_re = re.compile(r'(?s)<(?P<close>/?)(?P<name>[A-Za-z][^\s/>]*)(?:<\?.*?\?>|[^>])*>')

    def __init__(self, includeuntaggeddata=None, inputfile=None):
        """Set up an empty store and, if ``inputfile`` is given, parse it.

        ``includeuntaggeddata`` makes text outside any marking tag start a
        block of its own (see ``handle_data``).  ``inputfile`` is read
        completely and closed; its ``name`` attribute, when present, is used
        in unit locations.
        """
        self.units = []
        self.filename = getattr(inputfile, 'name', None)
        self.currentblock = ""
        self.currentblocknum = 0
        self.currentcomment = ""
        self.currenttag = None
        self.includeuntaggeddata = includeuntaggeddata
        # Exists from the start so reintrophp() also works when feed() is
        # called directly instead of parse().
        self.phpdict = {}
        HTMLParser.__init__(self)

        if inputfile is not None:
            try:
                htmlsrc = inputfile.read()
            finally:
                # Close the file even when reading fails.
                inputfile.close()
            self.parse(htmlsrc)

    def guess_encoding(self, htmlsrc):
        """Returns the encoding of the html text.

        We look for 'charset=' within a meta tag to do this.  Returns the
        charset name of the first such meta tag, or None if there is none.
        """
        found = self._charset_re.search(htmlsrc)
        if found:
            return found.group(1)
        return None

    def do_encoding(self, htmlsrc):
        """Return the html text properly encoded based on a charset.

        The text is returned unchanged when no charset is declared, when it
        is not a byte string (nothing left to decode), or when the declared
        charset is not a codec Python knows.
        """
        charset = self.guess_encoding(htmlsrc)
        if not charset or not isinstance(htmlsrc, bytes):
            return htmlsrc
        try:
            return htmlsrc.decode(charset)
        except LookupError:
            # Unknown or misspelt charset name in the document.
            return htmlsrc

    def phprep(self, text):
        """Replaces all instances of PHP with placeholder tags, and returns
        the new text.  The current implementation replaces <?foo?> with
        <?md5(foo)?>.  The hash => code conversions are stored in
        self.phpdict for later use in restoring the real PHP.

        The purpose of this is to remove all potential "tag-like" code from
        inside PHP.  The hash looks nothing like an HTML tag, but the following
        PHP::
          $a < $b ? $c : ($d > $e ? $f : $g)
        looks like it contains an HTML tag::
          < $b ? $c : ($d >
        to nearly any regex.  Hence, we replace all contents of PHP with simple
        strings to help our regexes out.

        """

        from translate.misc import hash as hashhelper

        self.phpdict = {}

        def placeholder(match):
            code = match.group(1)
            # md5 works on bytes; encode explicitly so that non-ASCII PHP in
            # already-decoded text does not raise UnicodeEncodeError.
            if isinstance(code, bytes):
                raw = code
            else:
                raw = code.encode("utf-8")
            digest = hashhelper.md5_f(raw).hexdigest()
            self.phpdict[digest] = code
            return "<?%s?>" % digest

        # Substitute inside the delimiters only.  A plain text.replace(code,
        # digest) would also rewrite identical text outside PHP, and an empty
        # "<??>" would insert the digest between every character.
        return self._php_re.sub(placeholder, text)

    def reintrophp(self, text):
        """Replaces the PHP placeholders in text with the real code"""
        for digest, code in self.phpdict.items():
            text = text.replace(digest, code)
        return text

    def parse(self, htmlsrc):
        """Decode ``htmlsrc``, hide its PHP and feed it to the HTML parser.

        NOTE(review): close() is never called and the last pending block is
        not flushed here, so text still buffered at end of input does not
        become a unit -- confirm whether that is intended.
        """
        htmlsrc = self.do_encoding(htmlsrc)
        htmlsrc = self.phprep(htmlsrc) #Clear out the PHP before parsing
        self.feed(htmlsrc)

    def addhtmlblock(self, text):
        """Add ``text`` as a new unit if it has translatable content.

        The unit gets a "filename:blocknumber" location and the comment
        collected so far as a note.
        """
        text = self.strip_html(text)
        text = self.reintrophp(text) #Before adding anything, restore PHP
        if self.has_translatable_content(text):
            self.currentblocknum += 1
            unit = self.addsourceunit(text)
            unit.addlocation("%s:%d" % (self.filename, self.currentblocknum))
            unit.addnote(self.currentcomment)

    def _enclosed_contents(self, text):
        """Return what lies between a tag pair that encloses all of ``text``.

        Returns None unless ``text`` starts with an opening tag, ends with
        the closing tag of the same name, and that closing tag really
        belongs to the opening one.
        """
        opening = self._open_tag_re.match(text)
        closing = self._close_tag_re.search(text)
        if opening is None or closing is None:
            return None
        if closing.start() < opening.end():
            return None
        name = opening.group('name').lower()
        if closing.group('name').lower() != name:
            # e.g. '<b>You</b> are <a>here</a>': different elements.
            return None
        contents = text[opening.end():closing.start()]
        # Track nesting of same-named tags: if the depth drops below zero,
        # the opening tag was closed before the end of the text, e.g.
        # '<a>x</a> and <a>y</a>'.
        depth = 0
        for tag in self._tag_re.finditer(contents):
            if tag.group('name').lower() != name:
                continue
            if tag.group('close'):
                depth -= 1
                if depth < 0:
                    return None
            elif not tag.group(0).endswith('/>'):
                depth += 1
        if depth != 0:
            return None
        return contents

    def strip_html(self, text):
        """Strip unnecessary html from the text.

        HTML tags are deemed unnecessary if it fully encloses the translatable
        text, eg. '<a href="index.html">Home Page</a>'.

        HTML tags that occurs within the normal flow of text will not be removed,
        eg. 'This is a link to the <a href="index.html">Home Page</a>.'
        """
        text = text.strip()

        # If all that is left is PHP, return ""
        if self._only_php_re.match(text):
            return ""

        contents = self._enclosed_contents(text)
        if contents is None:
            return text
        # Enclosing tags may be nested, so keep peeling.
        return self.strip_html(contents)

    def has_translatable_content(self, text):
        """Check if the supplied HTML snippet has any content that needs to be translated."""

        text = text.strip()
        result = re.findall(r'(?i).*(charset.*=.*)', text)
        if len(result) == 1:
            return False

        # TODO: Get a better way to find untranslatable entities.
        if text == '&nbsp;':
            return False

        # Lazily strip all PHP.  (?s) is needed because PHP has been restored
        # by now and may span several lines.
        remainder = re.sub(r'(?s)<\?.*?\?>', '', text).strip()
        # Strip all HTML tags
        remainder = re.sub(r'<[^>]*>', '', remainder).strip()
        return bool(remainder)

    #From here on below, follows the methods of the HTMLParser

    def startblock(self, tag):
        """Flush the pending block and start collecting a new one for ``tag``."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = tag

    def endblock(self):
        """Flush the pending block and leave block-collecting mode."""
        self.addhtmlblock(self.currentblock)
        self.currentblock = ""
        self.currentcomment = ""
        self.currenttag = None

    def handle_starttag(self, tag, attrs):
        """Open a new block for marking tags; otherwise keep the tag inline.

        Values of attributes in ``includeattrs`` are added as units.
        """
        newblock = tag in self.markingtags
        for attrname, attrvalue in attrs:
            if attrname in self.markingattrs:
                newblock = True
            # Attributes written without a value arrive as None.
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)

        if newblock:
            self.startblock(tag)
        elif self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_startendtag(self, tag, attrs):
        """Keep a self-closing tag inline and extract its included attributes."""
        for attrname, attrvalue in attrs:
            # Attributes written without a value arrive as None.
            if attrname in self.includeattrs and attrvalue is not None:
                self.addhtmlblock(attrvalue)
        if self.currenttag is not None:
            self.currentblock += self.get_starttag_text()

    def handle_endtag(self, tag):
        """Close the current block, or keep the end tag as inline markup."""
        if tag == self.currenttag:
            self.endblock()
        elif self.currenttag is not None:
            self.currentblock += '</%s>' % tag

    def handle_data(self, data):
        """Append text to the current block.

        Outside any block the text is dropped, unless ``includeuntaggeddata``
        is set, in which case it starts a block of its own.
        """
        if self.currenttag is not None:
            self.currentblock += data
        elif self.includeuntaggeddata:
            self.startblock(None)
            self.currentblock += data

    def handle_charref(self, name):
        """Keep a numeric character reference verbatim."""
        self.handle_data("&#%s;" % name)

    def handle_entityref(self, name):
        """Keep a named entity reference verbatim."""
        self.handle_data("&%s;" % name)

    def handle_comment(self, data):
        """Collect comment text; it becomes the note of the next unit."""
        # we can place comments above the msgid as translator comments!
        if self.currentcomment == "":
            self.currentcomment = data
        else:
            self.currentcomment += '\n' + data

    def handle_pi(self, data):
        """Put a processing instruction back into the text as "<?data>"."""
        self.handle_data("<?%s>" % data)

class POHTMLParser(htmlfile):
    """Subclass of htmlfile that adds no attributes or methods of its own."""