2 # -*- coding: utf-8 -*-
4 # html2text.py - converts HTML to text
6 # Wireshark - Network traffic analyzer
7 # By Gerald Combs <gerald@wireshark.org>
8 # Copyright 1998 Gerald Combs
10 # SPDX-License-Identifier: GPL-2.0-or-later
12 from __future__ import unicode_literals
14 __author__ = "Peter Wu <peter@lekensteyn.nl>"
15 __copyright__ = "Copyright 2015, Peter Wu"
16 __license__ = "GPL (v2 or later)"
# TODO:
#   support multiple list indentation levels
#   maybe allow ASCII output instead of UTF-8?
23 from textwrap import TextWrapper
25 from HTMLParser import HTMLParser
26 from htmlentitydefs import name2codepoint
28 from html.parser import HTMLParser
29 from html.entities import name2codepoint
30 unichr = chr # for html entity handling
class TextHTMLParser(HTMLParser):
    """Converts a HTML document to text.

    Feed markup through feed(). Text is collected per block element, wrapped
    and indented, and accumulated in output_buffer. close() flushes the last
    block, appends a "References" list with the targets of absolute links and
    writes the UTF-8 encoded result to standard output.
    """

    # End tags which terminate the text block that is being constructed.
    _BLOCK_ELEMENTS = frozenset(
        ('p', 'li', 'ul', 'pre', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    def __init__(self):
        try:
            # Python 3.4 and later: let the parser decode character
            # references before the text reaches handle_data().
            HTMLParser.__init__(self, convert_charrefs=True)
        except TypeError:
            # Older parsers reject the keyword and report references through
            # handle_charref() / handle_entityref() instead.
            HTMLParser.__init__(self)
        # All text, concatenated
        self.output_buffer = ''
        # The current text block which is being constructed
        self.text_block = ''
        # Whether the previous element was terminated with whitespace
        self.need_space = False
        # Whether to prevent word-wrapping the contents (for "pre" tag)
        self.skip_wrap = False
        # Marker for the current list item and the next number of an
        # ordered list (None outside of lists)
        self.list_item_prefix = None
        self.ordered_list_index = None
        # Indentation (for heading and paragraphs)
        self.indent_levels = [0, 0]
        # Don't dump CSS, scripts, etc.
        self.ignore_tags = ('head', 'style', 'script')
        # Nesting depth of ignored elements; text is dropped while > 0
        self.ignore_level = 0
        # Absolute link targets, in the order in which they were referenced
        self.footnotes = []
        # Target of the link which is currently open (None if it has none)
        self.href = None

    @staticmethod
    def _to_char(codepoint):
        """Returns the character for a code point on Python 2 and 3."""
        try:
            return unichr(codepoint)  # noqa: F821 (Python 2 builtin/alias)
        except NameError:
            return chr(codepoint)

    @staticmethod
    def _absolute_href(attrs):
        """Returns the first href attribute if it is absolute, else None."""
        for name, value in attrs:
            if name == 'href':
                # Skip relative URLs and links. The value is None for a bare
                # "href" attribute without "=".
                if value and '://' in value:
                    return value
                return None
        return None

    def _wrap_text(self, text):
        """Wraps text, but additionally indent list items."""
        indent = sum(self.indent_levels) * ' '
        initial_indent = indent
        if self.list_item_prefix:
            initial_indent += self.list_item_prefix
            # NOTE(review): continuation lines are lined up with the text
            # that follows the list marker - confirm the intended width.
            indent += len(self.list_item_prefix) * ' '
        # NOTE(review): no explicit width is given here, so the TextWrapper
        # default applies - confirm.
        wrapper = TextWrapper(
            initial_indent=initial_indent,
            subsequent_indent=indent,
            # Keep hyphenated words (options, identifiers) on one line.
            break_on_hyphens=False)
        return '\n'.join(wrapper.wrap(text))

    def _commit_block(self, newline='\n\n'):
        """Moves the pending text block to the output buffer.

        newline is appended after the block. An empty block produces no
        output, but need_space is cleared in either case.
        """
        text = self.text_block
        # NOTE(review): empty blocks are skipped so that adjacent end tags
        # do not pile up blank lines - confirm.
        if text:
            if not self.skip_wrap:
                text = self._wrap_text(text)
            self.output_buffer += text + newline
            self.text_block = ''
        self.need_space = False

    def handle_starttag(self, tag, attrs):
        """Updates the list, indentation, link and ignore state for a tag."""
        # end a block of text on <br>, but also flush list items which are not
        # terminated.
        if tag in ('br', 'li'):
            self._commit_block('\n')
        if tag == 'pre':
            self.skip_wrap = True
        if tag == 'ol':
            # Following list items are numbered.
            self.ordered_list_index = 1
        elif tag == 'ul':
            self.list_item_prefix = ' • '
        if tag == 'li' and self.ordered_list_index:
            self.list_item_prefix = ' %d. ' % (self.ordered_list_index)
            self.ordered_list_index += 1
        # Headings h1..h6 set the base indentation; the length is tested
        # first so that an empty tag name cannot raise IndexError.
        if len(tag) == 2 and tag[0] == 'h' and '1' <= tag[1] <= '6':
            self.indent_levels = [int(tag[1]) - 1, 0]
        if tag == 'p':
            self.indent_levels[1] = 1
        if tag == 'a':
            # Always replace the previous target, otherwise a relative link
            # would be reported with the URL of an earlier absolute link.
            self.href = self._absolute_href(attrs)
        if tag in self.ignore_tags:
            self.ignore_level += 1

    def handle_data(self, data):
        """Appends character data to the current text block."""
        if self.ignore_level > 0:
            return
        if self.skip_wrap:
            # Preformatted text is taken over verbatim.
            self.text_block += data
            return
        # For normal text, fold multiple whitespace and strip
        # leading and trailing spaces for the whole block (but
        # keep spaces in the middle).
        words = data.split()
        if words and data[:1].isspace():
            # Keep spaces in the middle
            self.need_space = True
        block = ' '.join(words)
        if self.need_space and words and self.text_block:
            block = ' ' + block
        self.need_space = data[-1:].isspace()
        self.text_block += block

    def handle_endtag(self, tag):
        """Flushes block elements and unwinds the state set by the tag."""
        if tag in self._BLOCK_ELEMENTS:
            self._commit_block()
        if tag in ('ol', 'ul'):
            self.list_item_prefix = None
            self.ordered_list_index = None
        if tag == 'pre':
            # Reset only after the block was committed without wrapping.
            self.skip_wrap = False
        if tag == 'a' and self.href:
            self.footnotes.append(self.href)
            self.text_block += '[{}]'.format(len(self.footnotes))
            # A stray second </a> must not repeat the footnote.
            self.href = None
        if tag in self.ignore_tags:
            # Never drop below zero: an unmatched end tag would otherwise
            # cancel the next ignored element and leak its contents.
            self.ignore_level = max(0, self.ignore_level - 1)

    def handle_charref(self, name):
        """Handles a numeric reference; name is "65" or "x41" style."""
        try:
            if name[:1] in ('x', 'X'):
                codepoint = int(name[1:], 16)
            else:
                codepoint = int(name)
            text = self._to_char(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point: keep the reference as written.
            text = '&#{};'.format(name)
        self.handle_data(text)

    def handle_entityref(self, name):
        """Handles a named reference such as "amp"."""
        codepoint = name2codepoint.get(name)
        if codepoint is None:
            # Unknown entity: keep the reference as written.
            text = '&{};'.format(name)
        else:
            text = self._to_char(codepoint)
        self.handle_data(text)

    def close(self):
        """Finishes parsing and writes the text to standard output.

        Pending text is flushed, the collected link targets are appended as
        a numbered "References" section and the whole output buffer is
        written UTF-8 encoded.
        """
        HTMLParser.close(self)
        self._commit_block()

        if self.footnotes:
            self.list_item_prefix = None
            self.indent_levels = [1, 0]
            self.text_block = 'References'
            self._commit_block()
            self.indent_levels = [1, 1]
            for footnote_num, href in enumerate(self.footnotes, 1):
                self.text_block += '{:>2}. {}\n'.format(footnote_num, href)
                self._commit_block('\n')

        byte_output = self.output_buffer.encode('utf-8')
        if hasattr(sys.stdout, 'buffer'):
            # Python 3: bypass the text layer and its locale encoding.
            sys.stdout.buffer.write(byte_output)
        else:
            sys.stdout.write(byte_output)
def main():
    """Converts a HTML document to text on standard output.

    The document is read from the file named by the first command line
    argument, or from standard input when the argument is missing or "-".
    The input is decoded as UTF-8.
    """
    if len(sys.argv) > 1 and sys.argv[1] != '-':
        # The context manager closes the file even if reading fails.
        with open(sys.argv[1], 'rb') as f:
            raw_html = f.read()
    else:
        # Access raw (byte) buffer in Python 3 instead of decoded one
        raw_html = getattr(sys.stdin, 'buffer', sys.stdin).read()

    htmlparser = TextHTMLParser()
    # Read the input as a Unicode string
    htmlparser.feed(raw_html.decode('utf-8'))
    # close() flushes the remaining text and writes the result
    htmlparser.close()


if __name__ == '__main__':
    main()