3 # html2text.py - converts HTML to text
5 # Wireshark - Network traffic analyzer
6 # By Gerald Combs <gerald@wireshark.org>
7 # Copyright 1998 Gerald Combs
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 2
12 # of the License, or (at your option) any later version.
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 from __future__ import unicode_literals
25 __author__ = "Peter Wu <peter@lekensteyn.nl>"
26 __copyright__ = "Copyright 2015, Peter Wu"
27 __license__ = "GPL (v2 or later)"
# TODO:
#   - support multiple list indentation levels
#   - maybe allow for ASCII output instead of UTF-8?
import sys
from textwrap import TextWrapper

try:
    # Python 2
    from HTMLParser import HTMLParser
    from htmlentitydefs import name2codepoint
except ImportError:
    # Python 3
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    unichr = chr # for html entity handling
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Text is collected per block element, word-wrapped at 66 columns
    (except inside "pre") and appended to ``output_buffer``.  Calling
    close() flushes the last block and writes the whole buffer to
    standard output, encoded as UTF-8.
    """

    # End tags which terminate the current block of text.  Definition
    # lists (dl, dd, dt) are not treated as block elements.
    _BLOCK_ELEMENTS = frozenset('p li ul pre ol h1 h2 h3 h4 h5 h6'.split())

    def __init__(self):
        try:
            # Python 3.4 and later: let the parser decode character
            # and entity references before calling handle_data().
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Older parsers do not accept the keyword argument; they
            # call handle_charref()/handle_entityref() instead.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # Prefix of the current list item and, for ordered lists, the
        # number of the next item
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items."""
        initial_indent = indent = sum(self.indent_levels) * ' '
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            # Continuation lines line up with the text after the prefix.
            indent += ' ' * len(self.list_item_prefix)
        wrapper = TextWrapper(width=66, break_on_hyphens=False,
                              initial_indent=initial_indent,
                              subsequent_indent=indent)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Appends the pending text block, followed by newline, to the output.

        Nothing is written when no text is pending.  The pending block
        and the "need space" state are reset in either case.
        """
        text = self.text_block
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates list, indentation and wrapping state for an opening tag."""
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag in ('br', 'li'):
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        # Following list items are numbered.
        if tag == 'ol':
            self.ordered_list_index = 1
        if tag == 'ul':
            self.list_item_prefix = ' * '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 set the base indentation and reset the
        # paragraph indentation.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1

    def handle_data(self, data):
        """Adds character data to the pending text block."""
        if self.skip_wrap:
            # Preformatted text is kept verbatim.
            block = data
        else:
            # For normal text, fold multiple whitespace and strip
            # leading and trailing spaces for the whole block (but
            # keep spaces in the middle).
            block = ''
            if data.strip() and data[:1].isspace():
                # Keep spaces in the middle
                self.need_space = True
            if self.need_space and data.strip() and self.text_block:
                block = ' '
            block += ' '.join(data.split())
            self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Flushes the pending text when a block element is closed."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            self.skip_wrap = False

    def handle_charref(self, name):
        """Handles a numeric character reference such as "62" or "x3E"."""
        if name[:1] in ('x', 'X'):
            # Hexadecimal form, the parser passes the leading "x" along.
            codepoint = int(name[1:], 16)
        else:
            codepoint = int(name)
        self.handle_data(unichr(codepoint))

    def handle_entityref(self, name):
        """Handles a named entity reference such as "amp"."""
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: keep the reference as text instead of
            # failing with a KeyError.
            self.handle_data('&%s;' % name)
        else:
            self.handle_data(unichr(codepoint))

    def close(self):
        """Finishes parsing and writes the converted text to stdout."""
        HTMLParser.close(self)
        # Flush text which was not terminated by a block element.
        self._commit_block()
        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: write the bytes to the underlying binary stream
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
def main():
    """Converts the HTML file given as first argument (or stdin) to text.

    The input is decoded as UTF-8 and fed to a TextHTMLParser, which is
    closed once all input has been consumed.
    """
    # io.open() decodes the file on both Python 2 and Python 3, so the
    # parser always receives Unicode text (as it does for stdin below).
    import io

    htmlparser = TextHTMLParser()
    if len(sys.argv) > 1:
        with io.open(sys.argv[1], encoding='utf-8') as f:
            for line in f:
                htmlparser.feed(line)
    else:
        f = sys.stdin
        if hasattr(f, 'buffer'):
            # Access raw (byte) buffer in Python 3 instead of decoded one
            f = f.buffer
        # Read stdin as a Unicode string
        htmlparser.feed(f.read().decode('utf-8'))
    htmlparser.close()


if __name__ == '__main__':
    sys.exit(main())