tools/html2text.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3 #
   4 # html2text.py - converts HTML to text
   5 #
   6 # Wireshark - Network traffic analyzer
   7 # By Gerald Combs <gerald@wireshark.org>
   8 # Copyright 1998 Gerald Combs
   9 #
  10 # SPDX-License-Identifier: GPL-2.0-or-later
  11
  12 from __future__ import unicode_literals
  13
  14 __author__      = "Peter Wu <peter@lekensteyn.nl>"
  15 __copyright__   = "Copyright 2015, Peter Wu"
  16 __license__     = "GPL (v2 or later)"
  17
  18 # TODO:
  19 #   multiple list indentation levels
  20 #   maybe allow for ascii output instead of utf-8?
  21
  22 import sys
  23 from textwrap import TextWrapper
  24 try:
  25     from HTMLParser import HTMLParser
  26     from htmlentitydefs import name2codepoint
  27 except: # Python 3
  28     from html.parser import HTMLParser
  29     from html.entities import name2codepoint
  30     unichr = chr # for html entity handling
  31
  32 class TextHTMLParser(HTMLParser):
  33     """Converts a HTML document to text."""
  34     def __init__(self):
  35         try:
  36             # Python 3.4
  37             HTMLParser. __init__(self, convert_charrefs=True)
  38         except:
  39             HTMLParser. __init__(self)
  40         # All text, concatenated
  41         self.output_buffer = ''
  42         # The current text block which is being constructed
  43         self.text_block = ''
  44         # Whether the previous element was terminated with whitespace
  45         self.need_space = False
  46         # Whether to prevent word-wrapping the contents (for "pre" tag)
  47         self.skip_wrap = False
  48         # track list items
  49         self.list_item_prefix = None
  50         self.ordered_list_index = None
  51         # Indentation (for heading and paragraphs)
  52         self.indent_levels = [0, 0]
  53         # Don't dump CSS, scripts, etc.
  54         self.ignore_tags = ('head', 'style', 'script')
  55         self.ignore_level = 0
  56         # href footnotes.
  57         self.footnotes = []
  58         self.href = None
  59
  60     def _wrap_text(self, text):
  61         """Wraps text, but additionally indent list items."""
  62         initial_indent = indent = sum(self.indent_levels) * ' '
  63         if self.list_item_prefix:
  64             initial_indent += self.list_item_prefix
  65             indent += '    '
  66         kwargs = {
  67             'width': 66,
  68             'initial_indent': initial_indent,
  69             'subsequent_indent': indent
  70         }
  71         if sys.version_info[0:2] >= (2, 6):
  72             kwargs['break_on_hyphens'] = False
  73         wrapper = TextWrapper(**kwargs)
  74         return '\n'.join(wrapper.wrap(text))
  75
  76     def _commit_block(self, newline='\n\n'):
  77         text = self.text_block
  78         if text:
  79             if not self.skip_wrap:
  80                 text = self._wrap_text(text)
  81             self.output_buffer += text + newline
  82             self.text_block = ''
  83         self.need_space = False
  84
  85     def handle_starttag(self, tag, attrs):
  86         # end a block of text on <br>, but also flush list items which are not
  87         # terminated.
  88         if tag == 'br' or tag == 'li':
  89             self._commit_block('\n')
  90         if tag == 'pre':
  91             self.skip_wrap = True
  92         # Following list items are numbered.
  93         if tag == 'ol':
  94             self.ordered_list_index = 1
  95         if tag == 'ul':
  96             self.list_item_prefix = '  • '
  97         if tag == 'li' and self.ordered_list_index:
  98             self.list_item_prefix =  ' %d. ' % (self.ordered_list_index)
  99             self.ordered_list_index += 1
 100         if tag[0] == 'h' and len(tag) == 2 and \
 101             (tag[1] >= '1' and tag[1] <= '6'):
 102             self.indent_levels = [int(tag[1]) - 1, 0]
 103         if tag == 'p':
 104             self.indent_levels[1] = 1
 105         if tag == 'a':
 106             try:
 107                 href = [attr[1] for attr in attrs if attr[0] == 'href'][0]
 108                 if '://' in href: # Skip relative URLs and links.
 109                     self.href = href
 110             except IndexError:
 111                 self.href = None
 112         if tag in self.ignore_tags:
 113             self.ignore_level += 1
 114
 115     def handle_data(self, data):
 116         if self.ignore_level > 0:
 117             return
 118         elif self.skip_wrap:
 119             block = data
 120         else:
 121             # For normal text, fold multiple whitespace and strip
 122             # leading and trailing spaces for the whole block (but
 123             # keep spaces in the middle).
 124             block = ''
 125             if data.strip() and data[:1].isspace():
 126                 # Keep spaces in the middle
 127                 self.need_space = True
 128             if self.need_space and data.strip() and self.text_block:
 129                 block = ' '
 130             block += ' '.join(data.split())
 131             self.need_space = data[-1:].isspace()
 132         self.text_block += block
 133
 134     def handle_endtag(self, tag):
 135         block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
 136         #block_elements += ' dl dd dt'
 137         if tag in block_elements.split():
 138             self._commit_block()
 139         if tag in ('ol', 'ul'):
 140             self.list_item_prefix = None
 141             self.ordered_list_index = None
 142         if tag == 'pre':
 143             self.skip_wrap = False
 144         if tag == 'a' and self.href:
 145             self.footnotes.append(self.href)
 146             self.text_block += '[{}]'.format(len(self.footnotes))
 147         if tag in self.ignore_tags:
 148             self.ignore_level -= 1
 149
 150     def handle_charref(self, name):
 151         self.handle_data(unichr(int(name)))
 152
 153     def handle_entityref(self, name):
 154         self.handle_data(unichr(name2codepoint[name]))
 155
 156     def close(self):
 157         HTMLParser.close(self)
 158         self._commit_block()
 159
 160         if len(self.footnotes) > 0:
 161             self.list_item_prefix = None
 162             self.indent_levels = [1, 0]
 163             self.text_block = 'References'
 164             self._commit_block()
 165             self.indent_levels = [1, 1]
 166             footnote_num = 1
 167             for href in self.footnotes:
 168                 self.text_block += '{:>2}. {}\n'.format(footnote_num, href)
 169                 footnote_num += 1
 170                 self._commit_block('\n')
 171
 172
 173         byte_output = self.output_buffer.encode('utf-8')
 174         if hasattr(sys.stdout, 'buffer'):
 175             sys.stdout.buffer.write(byte_output)
 176         else:
 177             sys.stdout.write(byte_output)
 178
 179
 180 def main():
 181     htmlparser = TextHTMLParser()
 182     if len(sys.argv) > 1 and sys.argv[1] != '-':
 183         filename = sys.argv[1]
 184         f = open(filename, 'rb')
 185     else:
 186         filename = None
 187         f = sys.stdin
 188     try:
 189         if hasattr(f, 'buffer'):
 190             # Access raw (byte) buffer in Python 3 instead of decoded one
 191             f = f.buffer
 192         # Read stdin as as Unicode string
 193         htmlparser.feed(f.read().decode('utf-8'))
 194     finally:
 195         if filename is not None:
 196             f.close()
 197     htmlparser.close()
 198
# Script entry point: run the conversion only when executed directly, not
# when imported. The process exit status is whatever main() returns.
if __name__ == '__main__':
    sys.exit(main())