# HG changeset patch
# User Rémi Cardona <remi.cardona@free.fr>
# Date 1412033306 -7200
# Tue Sep 30 01:28:26 2014 +0200
# Node ID fda99736b65fdf471eba268d056854f458a0cfa1
# Parent 56e26715c09be0ab51692fbc88b64d9a5c75a993
[transforms] Remove bundled copy of html2text
The upstream html2text package is already py3k compatible, so depend on it
instead. Rename our html2text module to htmltransform to prevent import
conflicts with the system html2text. Closes #268147.
# User Rémi Cardona <remi.cardona@free.fr>
# Date 1412033306 -7200
# Tue Sep 30 01:28:26 2014 +0200
# Node ID fda99736b65fdf471eba268d056854f458a0cfa1
# Parent 56e26715c09be0ab51692fbc88b64d9a5c75a993
[transforms] Remove bundled copy of html2text
The upstream html2text package is already py3k compatible, so depend on it
instead. Rename our html2text module to htmltransform to prevent import
conflicts with the system html2text. Closes #268147.
@@ -260,11 +260,11 @@
1 2 def register_base_transforms(engine, verb=True): 3 from logilab.mtconverter.transforms import cmdtransforms, text_to_text, \ 4 xml_to_text, text_to_html, xlog_to_html 5 from logilab.mtconverter.transforms.python import python_to_html 6 - from logilab.mtconverter.transforms.html2text import html_to_formatted_text 7 + from logilab.mtconverter.transforms.htmltransform import html_to_formatted_text 8 from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text 9 from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text 10 engine.add_transform(text_to_text()) 11 engine.add_transform(xml_to_text()) 12 engine.add_transform(text_to_html())
@@ -33,6 +33,7 @@
13 author = "Sylvain Thenault" 14 author_email = "contact@logilab.fr" 15 16 install_requires = [ 17 'six >= 1.4.0', 18 + 'html2text', 19 ]
@@ -39,11 +39,11 @@
20 converted = ENGINE.convert(data, 'text/plain').decode().strip() 21 self.assertEqual(converted, u'**yo (zou éà ;)**') 22 23 data = TransformData(u'<p>yo <br/>zogzog </p>', 'text/html', 'utf8') 24 converted = ENGINE.convert(data, 'text/plain').decode().strip() 25 - self.assertEqual(converted, u'yo\n\nzogzog') 26 + self.assertEqual(converted, u'yo \nzogzog') 27 28 def test_html_to_text_noenc(self): 29 self.skipTest('Encoding detection with chardet does not work') 30 # will trigger guess_encoding, check non-utf8 encoding 31 data = TransformData(u"<b>yo (l'état à l'oeuf)</b>".encode('latin1'), 'text/html')
@@ -13,458 +13,21 @@
32 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License 33 # for more details. 34 # 35 # You should have received a copy of the GNU Lesser General Public License along 36 # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>. 37 -"""html2text: Turn HTML into equivalent Markdown-structured text. 38 39 -There is some specific mtconvter code at the end to define the 40 -html to text transformation. 41 - 42 -Copyright (C) 2004-2008 Aaron Swartz. GNU GPL 3. 43 -Copyright (C) 2008 Logilab S.A. 44 -""" 45 - 46 -__version__ = "2.38" 47 -__author__ = "Aaron Swartz (me@aaronsw.com)" 48 -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3." 49 -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"] 50 - 51 -# TODO: 52 -# Support decoded entities with unifiable. 53 - 54 -if not hasattr(__builtins__, 'True'): True, False = 1, 0 55 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types 56 -import sgmllib 57 -import urlparse 58 -sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]') 59 - 60 -try: from textwrap import wrap 61 -except: pass 62 - 63 -# Use Unicode characters instead of their ascii psuedo-replacements 64 -UNICODE_SNOB = 0 65 - 66 -# Put the links after each paragraph instead of at the end. 67 -LINKS_EACH_PARAGRAPH = 0 68 - 69 -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.) 70 -BODY_WIDTH = 78 71 - 72 -# Don't show internal links (href="#local-anchor") -- corresponding link targets 73 -# won't be visible in the plain text file anyway. 
74 -SKIP_INTERNAL_LINKS = False 75 - 76 -### Entity Nonsense ### 77 - 78 -def name2cp(k): 79 - if k == 'apos': return ord("'") 80 - if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3 81 - return htmlentitydefs.name2codepoint[k] 82 - else: 83 - k = htmlentitydefs.entitydefs[k] 84 - if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1 85 - return ord(codecs.latin_1_decode(k)[0]) 86 - 87 -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 88 -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*', 89 -'ndash':'-', 'oelig':'oe', 'aelig':'ae', 90 -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 91 -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 92 -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i', 93 -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 94 -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'} 95 - 96 -unifiable_n = {} 97 - 98 -for k in unifiable.keys(): 99 - unifiable_n[name2cp(k)] = unifiable[k] 100 - 101 -def charref(name): 102 - if name[0] in ['x','X']: 103 - c = int(name[1:], 16) 104 - else: 105 - c = int(name) 106 - 107 - if not UNICODE_SNOB and c in unifiable_n.keys(): 108 - return unifiable_n[c] 109 - else: 110 - return unichr(c) 111 - 112 -def entityref(c): 113 - if not UNICODE_SNOB and c in unifiable.keys(): 114 - return unifiable[c] 115 - else: 116 - try: name2cp(c) 117 - except KeyError: return "&" + c 118 - else: return unichr(name2cp(c)) 119 - 120 -def replaceEntities(s): 121 - s = s.group(1) 122 - if s[0] == "#": 123 - return charref(s[1:]) 124 - else: return entityref(s) 125 - 126 -r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));") 127 -def unescape(s): 128 - return r_unescape.sub(replaceEntities, s) 129 - 130 -def fixattrs(attrs): 131 - # Fix bug in sgmllib.py 132 - if not attrs: return attrs 133 - newattrs = [] 134 - for attr in attrs: 135 - newattrs.append((attr[0], unescape(attr[1]))) 
136 - return newattrs 137 - 138 -### End Entity Nonsense ### 139 - 140 -def onlywhite(line): 141 - """Return true if the line does only consist of whitespace characters.""" 142 - for c in line: 143 - if c is not ' ' and c is not ' ': 144 - return c is ' ' 145 - return line 146 - 147 -def optwrap(text): 148 - """Wrap all paragraphs in the provided text.""" 149 - if not BODY_WIDTH: 150 - return text 151 - 152 - assert wrap, "Requires Python 2.3." 153 - result = '' 154 - newlines = 0 155 - for para in text.split("\n"): 156 - if len(para) > 0: 157 - if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*': 158 - for line in wrap(para, BODY_WIDTH): 159 - result += line + "\n" 160 - result += "\n" 161 - newlines = 2 162 - else: 163 - if not onlywhite(para): 164 - result += para + "\n" 165 - newlines = 1 166 - else: 167 - if newlines < 2: 168 - result += "\n" 169 - newlines += 1 170 - return result 171 - 172 -def hn(tag): 173 - if tag[0] == 'h' and len(tag) == 2: 174 - try: 175 - n = int(tag[1]) 176 - if n in range(1, 10): return n 177 - except ValueError: return 0 178 - 179 -class _html2text(sgmllib.SGMLParser): 180 - 181 - def __init__(self, out=None, baseurl='', encoding='utf8'): 182 - sgmllib.SGMLParser.__init__(self) 183 - 184 - if out is None: self.out = self.outtextf 185 - else: self.out = out 186 - self.outtext = [] 187 - self.quiet = 0 188 - self.p_p = 0 189 - self.outcount = 0 190 - self.start = 1 191 - self.space = 0 192 - self.a = [] 193 - self.astack = [] 194 - self.acount = 0 195 - self.list = [] 196 - self.blockquote = 0 197 - self.pre = 0 198 - self.startpre = 0 199 - self.lastWasNL = 0 200 - self.abbr_title = None # current abbreviation definition 201 - self.abbr_data = None # last inner HTML (for abbr being defined) 202 - self.abbr_list = {} # stack of abbreviations to write later 203 - self.baseurl = baseurl 204 - self._encoding = encoding 205 - 206 - def outtextf(self, s): 207 - if isinstance(s, str): 208 - s = unicode(s, self._encoding) 209 
- self.outtext.append( s ) 210 - 211 - def close(self): 212 - sgmllib.SGMLParser.close(self) 213 - 214 - self.pbr() 215 - self.o('', 0, 'end') 216 - 217 - return ''.join(self.outtext) 218 - 219 - def handle_charref(self, c): 220 - self.o(charref(c)) 221 - 222 - def handle_entityref(self, c): 223 - self.o(entityref(c)) 224 - 225 - def unknown_starttag(self, tag, attrs): 226 - self.handle_tag(tag, attrs, 1) 227 - 228 - def unknown_endtag(self, tag): 229 - self.handle_tag(tag, None, 0) 230 - 231 - def previousIndex(self, attrs): 232 - """ returns the index of certain set of attributes (of a link) in the 233 - self.a list 234 - 235 - If the set of attributes is not found, returns None 236 - """ 237 - if not attrs.has_key('href'): return None 238 - 239 - i = -1 240 - for a in self.a: 241 - i += 1 242 - match = 0 243 - 244 - if a.has_key('href') and a['href'] == attrs['href']: 245 - if a.has_key('title') or attrs.has_key('title'): 246 - if (a.has_key('title') and attrs.has_key('title') and 247 - a['title'] == attrs['title']): 248 - match = True 249 - else: 250 - match = True 251 - 252 - if match: return i 253 - 254 - def handle_tag(self, tag, attrs, start): 255 - attrs = fixattrs(attrs) 256 - 257 - if hn(tag): 258 - self.p() 259 - if start: self.o(hn(tag)*"#" + ' ') 260 - 261 - if tag in ['p', 'div']: self.p() 262 - 263 - if tag == "br" and start: self.o(" \n") 264 - 265 - if tag == "hr" and start: 266 - self.p() 267 - self.o("* * *") 268 - self.p() 269 - 270 - if tag in ["head", "style", 'script']: 271 - if start: self.quiet += 1 272 - else: self.quiet -= 1 273 - 274 - if tag in ["body"]: 275 - self.quiet = 0 # sites like 9rules.com never close <head> 276 - 277 - if tag == "blockquote": 278 - if start: 279 - self.p(); self.o('> ', 0, 1); self.start = 1 280 - self.blockquote += 1 281 - else: 282 - self.blockquote -= 1 283 - self.p() 284 - 285 - if tag in ['em', 'i', 'u']: self.o("_") 286 - if tag in ['strong', 'b']: self.o("**") 287 - if tag == "code" and not self.pre: 
self.o('`') #TODO: `` `this` `` 288 - if tag == "abbr": 289 - if start: 290 - attrsD = {} 291 - for (x, y) in attrs: attrsD[x] = y 292 - attrs = attrsD 293 - 294 - self.abbr_title = None 295 - self.abbr_data = '' 296 - if attrs.has_key('title'): 297 - self.abbr_title = attrs['title'] 298 - else: 299 - if self.abbr_title != None: 300 - self.abbr_list[self.abbr_data] = self.abbr_title 301 - self.abbr_title = None 302 - self.abbr_data = '' 303 - 304 - if tag == "a": 305 - if start: 306 - attrsD = {} 307 - for (x, y) in attrs: attrsD[x] = y 308 - attrs = attrsD 309 - if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')): 310 - self.astack.append(attrs) 311 - self.o("[") 312 - else: 313 - self.astack.append(None) 314 - else: 315 - if self.astack: 316 - a = self.astack.pop() 317 - if a: 318 - i = self.previousIndex(a) 319 - if i is not None: 320 - a = self.a[i] 321 - else: 322 - self.acount += 1 323 - a['count'] = self.acount 324 - a['outcount'] = self.outcount 325 - self.a.append(a) 326 - self.o("][" + `a['count']` + "]") 327 - 328 - if tag == "img" and start: 329 - attrsD = {} 330 - for (x, y) in attrs: attrsD[x] = y 331 - attrs = attrsD 332 - if attrs.has_key('src'): 333 - attrs['href'] = attrs['src'] 334 - alt = attrs.get('alt', '') 335 - i = self.previousIndex(attrs) 336 - if i is not None: 337 - attrs = self.a[i] 338 - else: 339 - self.acount += 1 340 - attrs['count'] = self.acount 341 - attrs['outcount'] = self.outcount 342 - self.a.append(attrs) 343 - self.o("![") 344 - self.o(alt) 345 - self.o("]["+`attrs['count']`+"]") 346 - 347 - if tag == 'dl' and start: self.p() 348 - if tag == 'dt' and not start: self.pbr() 349 - if tag == 'dd' and start: self.o(' ') 350 - if tag == 'dd' and not start: self.pbr() 351 - 352 - if tag in ["ol", "ul"]: 353 - if start: 354 - self.list.append({'name':tag, 'num':0}) 355 - else: 356 - if self.list: self.list.pop() 357 - 358 - self.p() 359 - 360 - if tag == 'li': 361 - if start: 362 - self.pbr() 
363 - if self.list: li = self.list[-1] 364 - else: li = {'name':'ul', 'num':0} 365 - self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly. 366 - if li['name'] == "ul": self.o("* ") 367 - elif li['name'] == "ol": 368 - li['num'] += 1 369 - self.o(`li['num']`+". ") 370 - self.start = 1 371 - else: 372 - self.pbr() 373 - 374 - if tag in ["table", "tr"] and start: self.p() 375 - if tag == 'td': self.pbr() 376 - 377 - if tag == "pre": 378 - if start: 379 - self.startpre = 1 380 - self.pre = 1 381 - else: 382 - self.pre = 0 383 - self.p() 384 - 385 - def pbr(self): 386 - if self.p_p == 0: self.p_p = 1 387 - 388 - def p(self): self.p_p = 2 389 - 390 - def o(self, data, puredata=0, force=0): 391 - if self.abbr_data is not None: self.abbr_data += data 392 - 393 - if not self.quiet: 394 - if puredata and not self.pre: 395 - data = re.sub('\s+', ' ', data) 396 - if data and data[0] == ' ': 397 - self.space = 1 398 - data = data[1:] 399 - if not data and not force: return 400 - 401 - if self.startpre: 402 - #self.out(" :") #TODO: not output when already one there 403 - self.startpre = 0 404 - 405 - bq = (">" * self.blockquote) 406 - if not (force and data and data[0] == ">") and self.blockquote: bq += " " 407 - 408 - if self.pre: 409 - bq += " " 410 - data = data.replace("\n", "\n"+bq) 411 - 412 - if self.start: 413 - self.space = 0 414 - self.p_p = 0 415 - self.start = 0 416 - 417 - if force == 'end': 418 - # It's the end. 
419 - self.p_p = 0 420 - self.out("\n") 421 - self.space = 0 422 - 423 - 424 - if self.p_p: 425 - self.out(('\n'+bq)*self.p_p) 426 - self.space = 0 427 - 428 - if self.space: 429 - if not self.lastWasNL: self.out(' ') 430 - self.space = 0 431 - 432 - if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"): 433 - if force == "end": self.out("\n") 434 - 435 - newa = [] 436 - for link in self.a: 437 - if self.outcount > link['outcount']: 438 - self.out(" ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href'])) 439 - if link.has_key('title'): self.out(" ("+link['title']+")") 440 - self.out("\n") 441 - else: 442 - newa.append(link) 443 - 444 - if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done. 445 - 446 - self.a = newa 447 - 448 - if self.abbr_list and force == "end": 449 - for abbr, definition in self.abbr_list.items(): 450 - self.out(" *[" + abbr + "]: " + definition + "\n") 451 - 452 - self.p_p = 0 453 - self.out(data) 454 - self.lastWasNL = data and data[-1] == '\n' 455 - self.outcount += 1 456 - 457 - def handle_data(self, data): 458 - if r'\/script>' in data: self.quiet -= 1 459 - self.o(data, 1) 460 - 461 - def unknown_decl(self, data): pass 462 - 463 -def wrapwrite(text): sys.stdout.write(text.encode('utf8')) 464 - 465 -def html2text_file(html, out=wrapwrite, baseurl='', encoding='utf8'): 466 - h = _html2text(out, baseurl, encoding=encoding) 467 - h.feed(html) 468 - h.feed("") 469 - return h.close() 470 - 471 -def html2text(html, baseurl='', encoding='utf8'): 472 - return optwrap(html2text_file(html.replace('/>', '>'), None, 473 - baseurl, encoding=encoding)) 474 - 475 - 476 -## mtconverter's specific code ################################################ 477 +from html2text import html2text 478 479 from logilab.mtconverter.transform import Transform 480 481 + 482 class html_to_formatted_text(Transform): 483 """transforms html to formatted plain text""" 484 485 name = "html_to_text" 486 inputs = 
("text/html",) 487 output = "text/plain" 488 489 490 def _convert(self, trdata): 491 - return html2text(trdata.data, encoding=trdata.encoding) 492 + return html2text(trdata.data).encode(trdata.encoding)