2 """html2text: Turn HTML into equivalent Markdown-structured text."""
3 __version__ = "2.35-Wireshark"
4 __author__ = "Aaron Swartz (me@aaronsw.com)"
5 __copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
6 __contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
9 # This is a modified version of html2text.py from http://www.aaronsw.com/2002/html2text/
11 # Options can now be configured from the command line.
12 # SKIP_LINKS and INPUT_ENCODING options have been added.
13 # The script now requires Python 2.3
16 # Support decoded entities with unifiable.
17 # Relative URL resolution
18 # Indent sections and lists similar to elinks/links/lynx
20 if not hasattr(__builtins__, 'True'): True, False = 1, 0
21 import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
23 sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
24 from optparse import OptionParser
26 try: from textwrap import wrap
29 oparser = OptionParser()
38 help="Use Unicode characters instead of their ascii psuedo-replacements. [default: False]",
42 "--links-after-paragraphs",
44 dest="LINKS_EACH_PARAGRAPH",
46 help="Put the links after each paragraph instead of at the end. [default: False]",
54 help="Wrap long lines at position. 0 for no wrapping. Requires Python 2.3. [default: 78 characters]",
58 "--no-internal-links",
60 dest="SKIP_INTERNAL_LINKS",
62 help='''Don't show internal links (href="#local-anchor"). Corresponding link targets won't be visible in the plain text file anyway. [default: False]''',
70 help='''Don't show links. [default: False]''',
76 dest="INPUT_ENCODING",
78 help='''Force the encoding of the input file. [default: utf-8]''',
81 ### Entity Nonsense ###
84 if k == 'apos': return ord("'")
85 if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
86 return htmlentitydefs.name2codepoint[k]
88 k = htmlentitydefs.entitydefs[k]
89 if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
90 return ord(codecs.latin_1_decode(k)[0])
# Entity name -> plain-ASCII stand-in. Consulted when the UNICODE_SNOB
# option is off, so output stays ASCII-only for these common entities.
unifiable = {
    # typographic quotes
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    # symbols, dashes, spacing and arrows
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-',
    # ligatures
    'oelig': 'oe', 'aelig': 'ae',
    # accented lowercase vowels collapse to the bare letter
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
    'atilde': 'a', 'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o',
    'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}
# Mirror the name-keyed replacement table into one keyed by code point,
# so numeric character references can find the same ASCII stand-ins.
for k in unifiable:
    codepoint = name2cp(k)
    unifiable_n[codepoint] = unifiable[k]
109 if name[0] in ['x','X']:
110 c = int(name[1:], 16)
114 if not options.UNICODE_SNOB and c in unifiable_n.keys():
115 return unifiable_n[c]
122 if not options.UNICODE_SNOB and c in unifiable.keys():
126 except KeyError: return "&" + c
127 else: return unichr(name2cp(c))
129 def replaceEntities(s):
132 return charref(s[1:])
133 else: return entityref(s)
# Matches one "&...;" reference. Group 1 is everything between the
# ampersand and the semicolon: an optional "#", an optional x/X, then
# either a run of hex digits or a name of one to eight word characters.
r_unescape = re.compile(
    r"&"
    r"(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}))"
    r";"
)
137 return r_unescape.sub(replaceEntities, s)
140 # Fix bug in sgmllib.py
141 if not attrs: return attrs
144 newattrs.append((attr[0], unescape(attr[1])))
147 ### End Entity Nonsense ###
150 """Return true if the line does only consist of whitespace characters."""
152 if c is not ' ' and c is not ' ':
157 """Wrap all paragraphs in the provided text."""
159 if not options.BODY_WIDTH:
162 assert wrap, "Requires Python 2.3."
165 for para in text.split("\n"):
167 if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
168 for line in wrap(para, options.BODY_WIDTH):
169 result += line + "\n"
173 if not onlywhite(para):
174 result += para + "\n"
183 if tag[0] == 'h' and len(tag) == 2:
186 if n in range(1, 10): return n
187 except ValueError: return 0
189 class _html2text(sgmllib.SGMLParser):
190 def __init__(self, out=sys.stdout.write):
191 sgmllib.SGMLParser.__init__(self)
193 if out is None: self.out = self.outtextf
209 self.abbr_title = None # current abbreviation definition
210 self.abbr_data = None # last inner HTML (for abbr being defined)
211 self.abbr_list = {} # stack of abbreviations to write later
213 def outtextf(self, s):
217 sgmllib.SGMLParser.close(self)
224 def handle_charref(self, c):
227 def handle_entityref(self, c):
def unknown_starttag(self, tag, attrs):
    """Forward an opening tag to handle_tag.

    The tag name and attributes are passed through unchanged, with the
    start flag set to 1 to mark this as the opening half of the element.
    """
    opening = 1
    self.handle_tag(tag, attrs, opening)
def unknown_endtag(self, tag):
    """Forward a closing tag to handle_tag.

    Closing tags carry no attributes, so None is passed in their place
    and the start flag is 0.
    """
    no_attrs = None
    closing = 0
    self.handle_tag(tag, no_attrs, closing)
236 def previousIndex(self, attrs):
237 """ returns the index of certain set of attributes (of a link) in the
240 If the set of attributes is not found, returns None
242 if not attrs.has_key('href'): return None
249 if a.has_key('href') and a['href'] == attrs['href']:
250 if a.has_key('title') or attrs.has_key('title'):
251 if (a.has_key('title') and attrs.has_key('title') and
252 a['title'] == attrs['title']):
259 def handle_tag(self, tag, attrs, start):
261 attrs = fixattrs(attrs)
265 if start: self.o(hn(tag)*"#" + ' ')
267 if tag in ['p', 'div']: self.p()
269 if tag == "br" and start: self.o(" \n")
271 if tag == "hr" and start:
276 if tag in ["head", "style", 'script']:
277 if start: self.quiet += 1
278 else: self.quiet -= 1
281 self.quiet = 0 # sites like 9rules.com never close <head>
283 if tag == "blockquote":
285 self.p(); self.o('> ', 0, 1); self.start = 1
291 if tag in ['em', 'i', 'u']: self.o("_")
292 if tag in ['strong', 'b']: self.o("**")
293 if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
297 for (x, y) in attrs: attrsD[x] = y
300 self.abbr_title = None
302 if attrs.has_key('title'):
303 self.abbr_title = attrs['title']
305 if self.abbr_title != None:
306 self.abbr_list[self.abbr_data] = self.abbr_title
307 self.abbr_title = None
313 for (x, y) in attrs: attrsD[x] = y
315 if attrs.has_key('href') and not (options.SKIP_LINKS or (options.SKIP_INTERNAL_LINKS and attrs['href'].startswith('#'))):
316 self.astack.append(attrs)
319 self.astack.append(None)
322 a = self.astack.pop()
324 i = self.previousIndex(a)
329 a['count'] = self.acount
330 a['outcount'] = self.outcount
332 self.o("][" + `a['count']` + "]")
334 if tag == "img" and start:
336 for (x, y) in attrs: attrsD[x] = y
338 if attrs.has_key('src'):
339 attrs['href'] = attrs['src']
340 alt = attrs.get('alt', '')
341 i = self.previousIndex(attrs)
346 attrs['count'] = self.acount
347 attrs['outcount'] = self.outcount
351 self.o("]["+`attrs['count']`+"]")
353 if tag == 'dl' and start: self.p()
354 if tag == 'dt' and not start: self.pbr()
355 if tag == 'dd' and start: self.o(' ')
356 if tag == 'dd' and not start: self.pbr()
358 if tag in ["ol", "ul"]:
360 self.list.append({'name':tag, 'num':0})
362 if self.list: self.list.pop()
369 if self.list: li = self.list[-1]
370 else: li = {'name':'ul', 'num':0}
371 self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
372 if li['name'] == "ul": self.o("* ")
373 elif li['name'] == "ol":
375 self.o(`li['num']`+". ")
380 if tag in ["table", "tr"] and start: self.p()
381 if tag == 'td': self.pbr()
392 if self.p_p == 0: self.p_p = 1
def p(self):
    """Request a paragraph break before the next output.

    Sets the pending-break counter p_p to 2; the output routine later
    emits that many newline (plus blockquote prefix) sequences.
    """
    self.p_p = 2
396 def o(self, data, puredata=0, force=0):
397 if self.abbr_data is not None: self.abbr_data += data
400 if puredata and not self.pre:
401 data = re.sub('\s+', ' ', data)
402 if data and data[0] == ' ':
405 if not data and not force: return
408 #self.out(" :") #TODO: not output when already one there
411 bq = (">" * self.blockquote)
412 if not (force and data and data[0] == ">") and self.blockquote: bq += " "
416 data = data.replace("\n", "\n"+bq)
431 self.out(('\n'+bq)*self.p_p)
435 if not self.lastWasNL: self.out(' ')
438 if self.a and ((self.p_p == 2 and options.LINKS_EACH_PARAGRAPH) or force == "end"):
439 if force == "end": self.out("\n")
443 if self.outcount > link['outcount']:
444 self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
445 if link.has_key('title'): self.out(" ("+link['title']+")")
450 if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
454 if self.abbr_list and force == "end":
455 for abbr, definition in self.abbr_list.items():
456 self.out(" *[" + abbr + "]: " + definition + "\n")
460 self.lastWasNL = data and data[-1] == '\n'
463 def handle_data(self, data):
464 if r'\/script>' in data: self.quiet -= 1
def unknown_decl(self, data):
    """Ignore the declaration text passed in; nothing is written.

    NOTE(review): presumably overrides the sgmllib.SGMLParser hook of
    the same name so unrecognised declarations are dropped -- confirm.
    """
    return None
def wrapwrite(text):
    """Write text to standard output, encoded as UTF-8.

    Serves as the default `out` callback of html2text_file.
    """
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
471 def html2text_file(html, out=wrapwrite):
472 global options, args, oparser
473 if options is None or args is None:
474 (options, args) = oparser.parse_args(None, None)
482 return optwrap(html2text_file(html, None))
484 if __name__ == "__main__":
485 (options, args) = oparser.parse_args()
488 if arg.startswith('http://'):
489 j = urllib.urlopen(arg)
491 from feedparser import _getCharacterEncoding as enc
493 enc = lambda x, y: ('utf-8', 1)
495 encoding = enc(j.headers, text)[0]
496 if encoding == 'us-ascii': encoding = 'utf-8'
497 data = text.decode(encoding)
500 data = open(arg, 'r').read().decode(options.INPUT_ENCODING)
502 data = sys.stdin.read().decode(options.INPUT_ENCODING)
503 wrapwrite(html2text(data))