tools/html2text.py

   1 #!/usr/bin/env python
   2 #
   3 # html2text.py - converts HTML to text
   4 #
   5 # Wireshark - Network traffic analyzer
   6 # By Gerald Combs <gerald@wireshark.org>
   7 # Copyright 1998 Gerald Combs
   8 #
   9 # This program is free software; you can redistribute it and/or
  10 # modify it under the terms of the GNU General Public License
  11 # as published by the Free Software Foundation; either version 2
  12 # of the License, or (at your option) any later version.
  13 #
  14 # This program is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18 #
  19 # You should have received a copy of the GNU General Public License
  20 # along with this program; if not, write to the Free Software
  21 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  22
  23 __author__      = "Peter Wu <peter@lekensteyn.nl>"
  24 __copyright__   = "Copyright 2015, Peter Wu"
  25 __license__     = "GPL (v2 or later)"
  26
  27 # TODO:
  28 #   multiple list indentation levels
  29 #   maybe allow for ascii output instead of utf-8?
  30
  31 import sys
  32 from textwrap import TextWrapper
  33 try:
  34     from HTMLParser import HTMLParser
  35     from htmlentitydefs import name2codepoint
  36 except: # Python 3
  37     from html.parser import HTMLParser
  38     from html.entities import name2codepoint
  39     unichr = chr # for html entity handling
  40
  41 class TextHTMLParser(HTMLParser):
  42     """Converts a HTML document to text."""
  43     def __init__(self):
  44         try:
  45             # Python 3.4
  46             HTMLParser. __init__(self, convert_charrefs=True)
  47         except:
  48             HTMLParser. __init__(self)
  49         # All text, concatenated
  50         self.output_buffer = ''
  51         # The current text block which is being constructed
  52         self.text_block = ''
  53         # Whether the previous element was terminated with whitespace
  54         self.need_space = False
  55         # Whether to prevent word-wrapping the contents (for "pre" tag)
  56         self.skip_wrap = False
  57         # track list items
  58         self.list_item_prefix = None
  59         self.ordered_list_index = None
  60         # Indentation (for heading and paragraphs)
  61         self.indent_levels = [0, 0]
  62
  63     def _wrap_text(self, text):
  64         """Wraps text, but additionally indent list items."""
  65         initial_indent = indent = sum(self.indent_levels) * ' '
  66         if self.list_item_prefix:
  67             initial_indent += self.list_item_prefix
  68             indent += '    '
  69         kwargs = {
  70             'width': 66,
  71             'initial_indent': initial_indent,
  72             'subsequent_indent': indent
  73         }
  74         if sys.version_info[0:2] >= (2, 6):
  75             kwargs['break_on_hyphens'] = False
  76         wrapper = TextWrapper(**kwargs)
  77         return '\n'.join(wrapper.wrap(text))
  78
  79     def _commit_block(self, newline='\n\n'):
  80         text = self.text_block
  81         if text:
  82             if not self.skip_wrap:
  83                 text = self._wrap_text(text)
  84             self.output_buffer += text + newline
  85             self.text_block = ''
  86         self.need_space = False
  87
  88     def handle_starttag(self, tag, attrs):
  89         # end a block of text on <br>, but also flush list items which are not
  90         # terminated.
  91         if tag == 'br' or tag == 'li':
  92             self._commit_block('\n')
  93         if tag == 'pre':
  94             self.skip_wrap = True
  95         # Following list items are numbered.
  96         if tag == 'ol':
  97             self.ordered_list_index = 1
  98         if tag == 'ul':
  99             self.list_item_prefix = '  * '
 100         if tag == 'li' and self.ordered_list_index:
 101             self.list_item_prefix =  ' %d. ' % (self.ordered_list_index)
 102             self.ordered_list_index += 1
 103         if tag[0] == 'h' and len(tag) == 2 and \
 104             (tag[1] >= '1' and tag[1] <= '6'):
 105             self.indent_levels = [int(tag[1]) - 1, 0]
 106         if tag == 'p':
 107             self.indent_levels[1] = 1
 108
 109     def handle_data(self, data):
 110         if self.skip_wrap:
 111             block = data
 112         else:
 113             # For normal text, fold multiple whitespace and strip
 114             # leading and trailing spaces for the whole block (but
 115             # keep spaces in the middle).
 116             block = ''
 117             if data.strip() and data[:1].isspace():
 118                 # Keep spaces in the middle
 119                 self.need_space = True
 120             if self.need_space and data.strip() and self.text_block:
 121                 block = ' '
 122             block += ' '.join(data.split())
 123             self.need_space = data[-1:].isspace()
 124         self.text_block += block
 125
 126     def handle_endtag(self, tag):
 127         block_elements = 'p li ul pre ol h1 h2 h3 h4 h5 h6'
 128         #block_elements += ' dl dd dt'
 129         if tag in block_elements.split():
 130             self._commit_block()
 131         if tag in ('ol', 'ul'):
 132             self.list_item_prefix = None
 133             self.ordered_list_index = None
 134         if tag == 'pre':
 135             self.skip_wrap = False
 136
 137     def handle_charref(self, name):
 138         self.handle_data(unichr(int(name)))
 139
 140     def handle_entityref(self, name):
 141         self.handle_data(unichr(name2codepoint[name]))
 142
 143     def close(self):
 144         HTMLParser.close(self)
 145         self._commit_block()
 146         byte_output = self.output_buffer.encode('utf-8')
 147         if hasattr(sys.stdout, 'buffer'):
 148             sys.stdout.buffer.write(byte_output)
 149         else:
 150             sys.stdout.write(byte_output)
 151
 152
 153 def main():
 154     htmlparser = TextHTMLParser()
 155     if len(sys.argv) > 1 and sys.argv[1] != '-':
 156         filename = sys.argv[1]
 157         f = open(filename, 'rb')
 158     else:
 159         filename = None
 160         f = sys.stdin
 161     try:
 162         if hasattr(f, 'buffer'):
 163             # Access raw (byte) buffer in Python 3 instead of decoded one
 164             f = f.buffer
 165         # Read stdin as as Unicode string
 166         htmlparser.feed(f.read().decode('utf-8'))
 167     finally:
 168         if filename is not None:
 169             f.close()
 170     htmlparser.close()
 171
 172 if __name__ == '__main__':
 173     sys.exit(main())