[transforms] Remove bundled copy of html2text

Upstream is already py3k compatible. Rename html2text module to prevent import conflicts with system html2text. Closes #268147.

author: Rémi Cardona <remi.cardona@free.fr>
changeset: fda99736b65f
branch: default
phase: public
hidden: no
parent revision: #56e26715c09b [py3k] deal with unicode/str mess
child revision: #7d28ad6a4aff [py3k] Fix odt2text to work properly with binary data
files modified by this revision
__init__.py
__pkginfo__.py
test/unittest_transforms.py
transforms/html2text.py
transforms/htmltransform.py
# HG changeset patch
# User Rémi Cardona <remi.cardona@free.fr>
# Date 1412033306 -7200
# Tue Sep 30 01:28:26 2014 +0200
# Node ID fda99736b65fdf471eba268d056854f458a0cfa1
# Parent 56e26715c09be0ab51692fbc88b64d9a5c75a993
[transforms] Remove bundled copy of html2text

Upstream is already py3k compatible. Rename html2text module to prevent
import conflicts with system html2text. Closes #268147.

diff --git a/__init__.py b/__init__.py
@@ -260,11 +260,11 @@
1 
2  def register_base_transforms(engine, verb=True):
3      from logilab.mtconverter.transforms import cmdtransforms, text_to_text, \
4           xml_to_text, text_to_html, xlog_to_html
5      from logilab.mtconverter.transforms.python import python_to_html
6 -    from logilab.mtconverter.transforms.html2text import html_to_formatted_text
7 +    from logilab.mtconverter.transforms.htmltransform import html_to_formatted_text
8      from logilab.mtconverter.transforms.odt2text import odt_to_unformatted_text
9      from logilab.mtconverter.transforms.pgpsignature import pgpsignature_to_text
10      engine.add_transform(text_to_text())
11      engine.add_transform(xml_to_text())
12      engine.add_transform(text_to_html())
diff --git a/__pkginfo__.py b/__pkginfo__.py
@@ -33,6 +33,7 @@
13  author = "Sylvain Thenault"
14  author_email = "contact@logilab.fr"
15 
16  install_requires = [
17      'six >= 1.4.0',
18 +    'html2text',
19      ]
diff --git a/test/unittest_transforms.py b/test/unittest_transforms.py
@@ -39,11 +39,11 @@
20          converted = ENGINE.convert(data, 'text/plain').decode().strip()
21          self.assertEqual(converted, u'**yo (zou �� ;)**')
22 
23          data = TransformData(u'<p>yo <br/>zogzog </p>', 'text/html', 'utf8')
24          converted = ENGINE.convert(data, 'text/plain').decode().strip()
25 -        self.assertEqual(converted, u'yo\n\nzogzog')
26 +        self.assertEqual(converted, u'yo  \nzogzog')
27 
28      def test_html_to_text_noenc(self):
29          self.skipTest('Encoding detection with chardet does not work')
30          # will trigger guess_encoding, check non-utf8 encoding
31          data = TransformData(u"<b>yo (l'état à l'oeuf)</b>".encode('latin1'), 'text/html')
diff --git a/transforms/html2text.py b/transforms/htmltransform.py
@@ -13,458 +13,21 @@
32  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
33  # for more details.
34  #
35  # You should have received a copy of the GNU Lesser General Public License along
36  # with logilab-mtconverter. If not, see <http://www.gnu.org/licenses/>.
37 -"""html2text: Turn HTML into equivalent Markdown-structured text.
38 
39 -There is some specific mtconvter code at the end to define the
40 -html to text transformation.
41 -
42 -Copyright (C) 2004-2008 Aaron Swartz. GNU GPL 3.
43 -Copyright (C) 2008 Logilab S.A.
44 -"""
45 -
46 -__version__ = "2.38"
47 -__author__ = "Aaron Swartz (me@aaronsw.com)"
48 -__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
49 -__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
50 -
51 -# TODO:
52 -#   Support decoded entities with unifiable.
53 -
54 -if not hasattr(__builtins__, 'True'): True, False = 1, 0
55 -import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
56 -import sgmllib
57 -import urlparse
58 -sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
59 -
60 -try: from textwrap import wrap
61 -except: pass
62 -
63 -# Use Unicode characters instead of their ascii psuedo-replacements
64 -UNICODE_SNOB = 0
65 -
66 -# Put the links after each paragraph instead of at the end.
67 -LINKS_EACH_PARAGRAPH = 0
68 -
69 -# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
70 -BODY_WIDTH = 78
71 -
72 -# Don't show internal links (href="#local-anchor") -- corresponding link targets
73 -# won't be visible in the plain text file anyway.
74 -SKIP_INTERNAL_LINKS = False
75 -
76 -### Entity Nonsense ###
77 -
78 -def name2cp(k):
79 -    if k == 'apos': return ord("'")
80 -    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
81 -        return htmlentitydefs.name2codepoint[k]
82 -    else:
83 -        k = htmlentitydefs.entitydefs[k]
84 -        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
85 -        return ord(codecs.latin_1_decode(k)[0])
86 -
87 -unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
88 -'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
89 -'ndash':'-', 'oelig':'oe', 'aelig':'ae',
90 -'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
91 -'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
92 -'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
93 -'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
94 -'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
95 -
96 -unifiable_n = {}
97 -
98 -for k in unifiable.keys():
99 -    unifiable_n[name2cp(k)] = unifiable[k]
100 -
101 -def charref(name):
102 -    if name[0] in ['x','X']:
103 -        c = int(name[1:], 16)
104 -    else:
105 -        c = int(name)
106 -
107 -    if not UNICODE_SNOB and c in unifiable_n.keys():
108 -        return unifiable_n[c]
109 -    else:
110 -        return unichr(c)
111 -
112 -def entityref(c):
113 -    if not UNICODE_SNOB and c in unifiable.keys():
114 -        return unifiable[c]
115 -    else:
116 -        try: name2cp(c)
117 -        except KeyError: return "&" + c
118 -        else: return unichr(name2cp(c))
119 -
120 -def replaceEntities(s):
121 -    s = s.group(1)
122 -    if s[0] == "#":
123 -        return charref(s[1:])
124 -    else: return entityref(s)
125 -
126 -r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
127 -def unescape(s):
128 -    return r_unescape.sub(replaceEntities, s)
129 -
130 -def fixattrs(attrs):
131 -    # Fix bug in sgmllib.py
132 -    if not attrs: return attrs
133 -    newattrs = []
134 -    for attr in attrs:
135 -        newattrs.append((attr[0], unescape(attr[1])))
136 -    return newattrs
137 -
138 -### End Entity Nonsense ###
139 -
140 -def onlywhite(line):
141 -    """Return true if the line does only consist of whitespace characters."""
142 -    for c in line:
143 -        if c is not ' ' and c is not '  ':
144 -            return c is ' '
145 -    return line
146 -
147 -def optwrap(text):
148 -    """Wrap all paragraphs in the provided text."""
149 -    if not BODY_WIDTH:
150 -        return text
151 -
152 -    assert wrap, "Requires Python 2.3."
153 -    result = ''
154 -    newlines = 0
155 -    for para in text.split("\n"):
156 -        if len(para) > 0:
157 -            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
158 -                for line in wrap(para, BODY_WIDTH):
159 -                    result += line + "\n"
160 -                result += "\n"
161 -                newlines = 2
162 -            else:
163 -                if not onlywhite(para):
164 -                    result += para + "\n"
165 -                    newlines = 1
166 -        else:
167 -            if newlines < 2:
168 -                result += "\n"
169 -                newlines += 1
170 -    return result
171 -
172 -def hn(tag):
173 -    if tag[0] == 'h' and len(tag) == 2:
174 -        try:
175 -            n = int(tag[1])
176 -            if n in range(1, 10): return n
177 -        except ValueError: return 0
178 -
179 -class _html2text(sgmllib.SGMLParser):
180 -
181 -    def __init__(self, out=None, baseurl='', encoding='utf8'):
182 -        sgmllib.SGMLParser.__init__(self)
183 -
184 -        if out is None: self.out = self.outtextf
185 -        else: self.out = out
186 -        self.outtext = []
187 -        self.quiet = 0
188 -        self.p_p = 0
189 -        self.outcount = 0
190 -        self.start = 1
191 -        self.space = 0
192 -        self.a = []
193 -        self.astack = []
194 -        self.acount = 0
195 -        self.list = []
196 -        self.blockquote = 0
197 -        self.pre = 0
198 -        self.startpre = 0
199 -        self.lastWasNL = 0
200 -        self.abbr_title = None # current abbreviation definition
201 -        self.abbr_data = None # last inner HTML (for abbr being defined)
202 -        self.abbr_list = {} # stack of abbreviations to write later
203 -        self.baseurl = baseurl
204 -        self._encoding = encoding
205 -
206 -    def outtextf(self, s):
207 -        if isinstance(s, str):
208 -            s = unicode(s, self._encoding)
209 -        self.outtext.append( s )
210 -
211 -    def close(self):
212 -        sgmllib.SGMLParser.close(self)
213 -
214 -        self.pbr()
215 -        self.o('', 0, 'end')
216 -
217 -        return ''.join(self.outtext)
218 -
219 -    def handle_charref(self, c):
220 -        self.o(charref(c))
221 -
222 -    def handle_entityref(self, c):
223 -        self.o(entityref(c))
224 -
225 -    def unknown_starttag(self, tag, attrs):
226 -        self.handle_tag(tag, attrs, 1)
227 -
228 -    def unknown_endtag(self, tag):
229 -        self.handle_tag(tag, None, 0)
230 -
231 -    def previousIndex(self, attrs):
232 -        """ returns the index of certain set of attributes (of a link) in the
233 -            self.a list
234 -
235 -            If the set of attributes is not found, returns None
236 -        """
237 -        if not attrs.has_key('href'): return None
238 -
239 -        i = -1
240 -        for a in self.a:
241 -            i += 1
242 -            match = 0
243 -
244 -            if a.has_key('href') and a['href'] == attrs['href']:
245 -                if a.has_key('title') or attrs.has_key('title'):
246 -                        if (a.has_key('title') and attrs.has_key('title') and
247 -                            a['title'] == attrs['title']):
248 -                            match = True
249 -                else:
250 -                    match = True
251 -
252 -            if match: return i
253 -
254 -    def handle_tag(self, tag, attrs, start):
255 -        attrs = fixattrs(attrs)
256 -
257 -        if hn(tag):
258 -            self.p()
259 -            if start: self.o(hn(tag)*"#" + ' ')
260 -
261 -        if tag in ['p', 'div']: self.p()
262 -
263 -        if tag == "br" and start: self.o("  \n")
264 -
265 -        if tag == "hr" and start:
266 -            self.p()
267 -            self.o("* * *")
268 -            self.p()
269 -
270 -        if tag in ["head", "style", 'script']:
271 -            if start: self.quiet += 1
272 -            else: self.quiet -= 1
273 -
274 -        if tag in ["body"]:
275 -            self.quiet = 0 # sites like 9rules.com never close <head>
276 -
277 -        if tag == "blockquote":
278 -            if start:
279 -                self.p(); self.o('> ', 0, 1); self.start = 1
280 -                self.blockquote += 1
281 -            else:
282 -                self.blockquote -= 1
283 -                self.p()
284 -
285 -        if tag in ['em', 'i', 'u']: self.o("_")
286 -        if tag in ['strong', 'b']: self.o("**")
287 -        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
288 -        if tag == "abbr":
289 -            if start:
290 -                attrsD = {}
291 -                for (x, y) in attrs: attrsD[x] = y
292 -                attrs = attrsD
293 -
294 -                self.abbr_title = None
295 -                self.abbr_data = ''
296 -                if attrs.has_key('title'):
297 -                    self.abbr_title = attrs['title']
298 -            else:
299 -                if self.abbr_title != None:
300 -                    self.abbr_list[self.abbr_data] = self.abbr_title
301 -                    self.abbr_title = None
302 -                self.abbr_data = ''
303 -
304 -        if tag == "a":
305 -            if start:
306 -                attrsD = {}
307 -                for (x, y) in attrs: attrsD[x] = y
308 -                attrs = attrsD
309 -                if attrs.has_key('href') and not (SKIP_INTERNAL_LINKS and attrs['href'].startswith('#')):
310 -                    self.astack.append(attrs)
311 -                    self.o("[")
312 -                else:
313 -                    self.astack.append(None)
314 -            else:
315 -                if self.astack:
316 -                    a = self.astack.pop()
317 -                    if a:
318 -                        i = self.previousIndex(a)
319 -                        if i is not None:
320 -                            a = self.a[i]
321 -                        else:
322 -                            self.acount += 1
323 -                            a['count'] = self.acount
324 -                            a['outcount'] = self.outcount
325 -                            self.a.append(a)
326 -                        self.o("][" + `a['count']` + "]")
327 -
328 -        if tag == "img" and start:
329 -            attrsD = {}
330 -            for (x, y) in attrs: attrsD[x] = y
331 -            attrs = attrsD
332 -            if attrs.has_key('src'):
333 -                attrs['href'] = attrs['src']
334 -                alt = attrs.get('alt', '')
335 -                i = self.previousIndex(attrs)
336 -                if i is not None:
337 -                    attrs = self.a[i]
338 -                else:
339 -                    self.acount += 1
340 -                    attrs['count'] = self.acount
341 -                    attrs['outcount'] = self.outcount
342 -                    self.a.append(attrs)
343 -                self.o("![")
344 -                self.o(alt)
345 -                self.o("]["+`attrs['count']`+"]")
346 -
347 -        if tag == 'dl' and start: self.p()
348 -        if tag == 'dt' and not start: self.pbr()
349 -        if tag == 'dd' and start: self.o('    ')
350 -        if tag == 'dd' and not start: self.pbr()
351 -
352 -        if tag in ["ol", "ul"]:
353 -            if start:
354 -                self.list.append({'name':tag, 'num':0})
355 -            else:
356 -                if self.list: self.list.pop()
357 -
358 -            self.p()
359 -
360 -        if tag == 'li':
361 -            if start:
362 -                self.pbr()
363 -                if self.list: li = self.list[-1]
364 -                else: li = {'name':'ul', 'num':0}
365 -                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
366 -                if li['name'] == "ul": self.o("* ")
367 -                elif li['name'] == "ol":
368 -                    li['num'] += 1
369 -                    self.o(`li['num']`+". ")
370 -                self.start = 1
371 -            else:
372 -                self.pbr()
373 -
374 -        if tag in ["table", "tr"] and start: self.p()
375 -        if tag == 'td': self.pbr()
376 -
377 -        if tag == "pre":
378 -            if start:
379 -                self.startpre = 1
380 -                self.pre = 1
381 -            else:
382 -                self.pre = 0
383 -            self.p()
384 -
385 -    def pbr(self):
386 -        if self.p_p == 0: self.p_p = 1
387 -
388 -    def p(self): self.p_p = 2
389 -
390 -    def o(self, data, puredata=0, force=0):
391 -        if self.abbr_data is not None: self.abbr_data += data
392 -
393 -        if not self.quiet:
394 -            if puredata and not self.pre:
395 -                data = re.sub('\s+', ' ', data)
396 -                if data and data[0] == ' ':
397 -                    self.space = 1
398 -                    data = data[1:]
399 -            if not data and not force: return
400 -
401 -            if self.startpre:
402 -                #self.out(" :") #TODO: not output when already one there
403 -                self.startpre = 0
404 -
405 -            bq = (">" * self.blockquote)
406 -            if not (force and data and data[0] == ">") and self.blockquote: bq += " "
407 -
408 -            if self.pre:
409 -                bq += "    "
410 -                data = data.replace("\n", "\n"+bq)
411 -
412 -            if self.start:
413 -                self.space = 0
414 -                self.p_p = 0
415 -                self.start = 0
416 -
417 -            if force == 'end':
418 -                # It's the end.
419 -                self.p_p = 0
420 -                self.out("\n")
421 -                self.space = 0
422 -
423 -
424 -            if self.p_p:
425 -                self.out(('\n'+bq)*self.p_p)
426 -                self.space = 0
427 -
428 -            if self.space:
429 -                if not self.lastWasNL: self.out(' ')
430 -                self.space = 0
431 -
432 -            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
433 -                if force == "end": self.out("\n")
434 -
435 -                newa = []
436 -                for link in self.a:
437 -                    if self.outcount > link['outcount']:
438 -                        self.out("   ["+`link['count']`+"]: " + urlparse.urljoin(self.baseurl, link['href']))
439 -                        if link.has_key('title'): self.out(" ("+link['title']+")")
440 -                        self.out("\n")
441 -                    else:
442 -                        newa.append(link)
443 -
444 -                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
445 -
446 -                self.a = newa
447 -
448 -            if self.abbr_list and force == "end":
449 -                for abbr, definition in self.abbr_list.items():
450 -                    self.out("  *[" + abbr + "]: " + definition + "\n")
451 -
452 -            self.p_p = 0
453 -            self.out(data)
454 -            self.lastWasNL = data and data[-1] == '\n'
455 -            self.outcount += 1
456 -
457 -    def handle_data(self, data):
458 -        if r'\/script>' in data: self.quiet -= 1
459 -        self.o(data, 1)
460 -
461 -    def unknown_decl(self, data): pass
462 -
463 -def wrapwrite(text): sys.stdout.write(text.encode('utf8'))
464 -
465 -def html2text_file(html, out=wrapwrite, baseurl='', encoding='utf8'):
466 -    h = _html2text(out, baseurl, encoding=encoding)
467 -    h.feed(html)
468 -    h.feed("")
469 -    return h.close()
470 -
471 -def html2text(html, baseurl='', encoding='utf8'):
472 -    return optwrap(html2text_file(html.replace('/>', '>'), None,
473 -                                  baseurl, encoding=encoding))
474 -
475 -
476 -## mtconverter's specific code ################################################
477 +from html2text import html2text
478 
479  from logilab.mtconverter.transform import Transform
480 
481 +
482  class html_to_formatted_text(Transform):
483      """transforms html to formatted plain text"""
484 
485      name = "html_to_text"
486      inputs  = ("text/html",)
487      output = "text/plain"
488 
489 
490      def _convert(self, trdata):
491 -        return html2text(trdata.data, encoding=trdata.encoding)
492 +        return html2text(trdata.data).encode(trdata.encoding)