Add my pre-commit git script (with checkAPI/hf/encoding args...) Need to copy in...
[metze/wireshark/wip.git] / tools / indexcap.py
1 #!/usr/bin/python
2 #
3 # Tool to index protocols that appear in the given capture files
4 #
5 # The script list_protos_in_cap.sh does the same thing.
6 #
7 # Copyright 2009, Kovarththanan Rajaratnam <kovarththanan.rajaratnam@gmail.com>
8 #
9 # $Id$
10 #
11 # Wireshark - Network traffic analyzer
12 # By Gerald Combs <gerald@wireshark.org>
13 # Copyright 1998 Gerald Combs
14 #
15 # This program is free software; you can redistribute it and/or
16 # modify it under the terms of the GNU General Public License
17 # as published by the Free Software Foundation; either version 2
18 # of the License, or (at your option) any later version.
19 #
20 # This program is distributed in the hope that it will be useful,
21 # but WITHOUT ANY WARRANTY; without even the implied warranty of
22 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23 # GNU General Public License for more details.
24 #
25 # You should have received a copy of the GNU General Public License
26 # along with this program; if not, write to the Free Software
27 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
28 #
29
30 from optparse import OptionParser
31 import multiprocessing
32 import sys
33 import os
34 import subprocess
35 import re
36 import pickle
37 import tempfile
38 import filecmp
39 import random
40
def extract_protos_from_file_proces(tshark, file):
    """Tally the protocols tshark reports for one capture file.

    Runs ``tshark -Tfields -e frame.protocols -r <file>`` and counts every
    name found in the colon-separated protocol lists it prints.

    Returns a ``(file, counts)`` tuple, where ``counts`` maps a protocol name
    to the number of times it was seen.  ``counts`` is empty when tshark
    exits with a non-zero status.  Returns None when the worker is
    interrupted with KeyboardInterrupt.
    """
    try:
        tshark_proc = subprocess.Popen(
            [tshark, "-Tfields", "-e", "frame.protocols", "-r", file],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        output = tshark_proc.communicate()[0]
        if tshark_proc.returncode != 0:
            return (file, {})

        counts = {}
        for row in output.splitlines():
            # Anything that is not a plain "eth:ip:tcp"-style list is ignored.
            if re.match(r'^[\w:-]+$', row):
                for name in row.split(':'):
                    counts[name] = counts.get(name, 0) + 1

        return (file, counts)
    except KeyboardInterrupt:
        return None
60
def extract_protos_from_file(tshark, num_procs, max_files, cap_files, cap_hash, index_file_name):
    """Index the protocols of every capture file and save the index.

    Each file in ``cap_files`` is handed to extract_protos_from_file_proces()
    in a pool of ``num_procs`` worker processes.  A status line is printed per
    file, the per-file protocol counts are merged into ``cap_hash`` (which is
    modified in place) and ``cap_hash`` is finally pickled to
    ``index_file_name``.

    This function does not return: it exits the interpreter with status 0 on
    success and status 1 when interrupted by the user.
    """
    pool = multiprocessing.Pool(num_procs)
    results = [pool.apply_async(extract_protos_from_file_proces, [tshark, cap_file]) for cap_file in cap_files]
    try:
        for (cur_item_idx, result_async) in enumerate(results):
            file_result = result_async.get()
            if file_result is None:
                # The worker swallowed a KeyboardInterrupt and has nothing to
                # report; treat it like an interrupt of the parent.
                raise KeyboardInterrupt
            (cap_file, proto_hash) = file_result
            # An empty hash means tshark failed or reported no protocols.
            # (The former "is {}" identity test could never be true.)
            action = "PROCESSED" if proto_hash else "SKIPPED"
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, cap_file, os.path.getsize(cap_file)))
            cap_hash[cap_file] = proto_hash
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)

    # The context manager closes the index file even if pickling fails.
    with open(index_file_name, "w") as index_file:
        pickle.dump(cap_hash, index_file)
    sys.exit(0)
79
def dissect_file_process(tshark, tmpdir, file):
    """Dissect one capture file with tshark, saving its output to temp files.

    Runs ``tshark -nxVr <file>`` with stdout and stderr redirected to two
    files freshly created in ``tmpdir`` (suffixes ``_stdout`` / ``_stderr``).
    The output files are left on disk for the caller.

    Returns ``(file, ok, stdout_path, stderr_path)`` where ``ok`` is True when
    tshark exited with status 0.  Returns False when the worker is interrupted
    with KeyboardInterrupt.  Errors creating the temp files or starting tshark
    propagate to the caller.
    """
    # Pre-set so the cleanup below only closes descriptors that were actually
    # opened; otherwise a failing mkstemp() would be masked by a NameError.
    handle_o = handle_e = None
    try:
        (handle_o, tmpfile_o) = tempfile.mkstemp(suffix='_stdout', dir=tmpdir)
        (handle_e, tmpfile_e) = tempfile.mkstemp(suffix='_stderr', dir=tmpdir)
        p = subprocess.Popen([tshark, "-nxVr", file], stdout=handle_o, stderr=handle_e)
        # Output goes straight to the files, so there are no pipes to drain.
        p.wait()
        return (file, p.returncode == 0, tmpfile_o, tmpfile_e)

    except KeyboardInterrupt:
        return False

    finally:
        for handle in (handle_o, handle_e):
            if handle is not None:
                os.close(handle)
98
def dissect_files(tshark, tmpdir, num_procs, max_files, cap_files):
    """Dissect every capture file with tshark and report PASSED/FAILED.

    Each file in ``cap_files`` is handed to dissect_file_process() in a pool
    of ``num_procs`` worker processes; a file passes when tshark exits with
    status 0.  The output files the workers create in ``tmpdir`` are not
    removed here.

    Exits the interpreter with status 1 when interrupted by the user.
    """
    pool = multiprocessing.Pool(num_procs)
    results = [pool.apply_async(dissect_file_process, [tshark, tmpdir, cap_file]) for cap_file in cap_files]
    try:
        for (cur_item_idx, result_async) in enumerate(results):
            file_result = result_async.get()
            if file_result is False:
                # The worker swallowed a KeyboardInterrupt and returned no
                # result tuple; indexing it would raise a TypeError.
                raise KeyboardInterrupt
            action = "FAILED" if file_result[1] is False else "PASSED"
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result[0], os.path.getsize(file_result[0])))
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)
111
def compare_files(tshark_bin, tmpdir, tshark_cmp, num_procs, max_files, cap_files):
    """Dissect every capture file with two tshark builds and compare them.

    Each file in ``cap_files`` is dissected once with ``tshark_bin`` and once
    with ``tshark_cmp`` via dissect_file_process(), using a pool of
    ``num_procs`` worker processes.  A file passes only when both runs exit
    with status 0 and their stdout and stderr output files are identical.
    The output files of passing runs are deleted; those of failing runs are
    left in ``tmpdir``.

    Exits the interpreter with status 1 when interrupted by the user.
    """
    pool = multiprocessing.Pool(num_procs)
    results_bin = [pool.apply_async(dissect_file_process, [tshark_bin, tmpdir, cap_file]) for cap_file in cap_files]
    results_cmp = [pool.apply_async(dissect_file_process, [tshark_cmp, tmpdir, cap_file]) for cap_file in cap_files]
    try:
        for (cur_item_idx, (result_async_bin, result_async_cmp)) in enumerate(zip(results_bin, results_cmp)):
            file_result_bin = result_async_bin.get()
            file_result_cmp = result_async_cmp.get()
            if file_result_bin is False or file_result_cmp is False:
                # A worker swallowed a KeyboardInterrupt and returned no
                # result tuple; indexing it would raise a TypeError.
                raise KeyboardInterrupt

            # This must be a single elif chain: with independent "if"s the
            # final stderr check overwrote earlier failures, reporting them as
            # PASSED and deleting the output needed to investigate them.
            if file_result_cmp[1] is False or file_result_bin[1] is False:
                action = "FAILED (exitcode)"
            elif not filecmp.cmp(file_result_bin[2], file_result_cmp[2]):
                action = "FAILED (stdout)"
            elif not filecmp.cmp(file_result_bin[3], file_result_cmp[3]):
                action = "FAILED (stderr)"
            else:
                action = "PASSED"
                os.remove(file_result_bin[2])
                os.remove(file_result_cmp[2])
                os.remove(file_result_bin[3])
                os.remove(file_result_cmp[3])

            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result_bin[0], os.path.getsize(file_result_bin[0])))
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result_cmp[0], os.path.getsize(file_result_cmp[0])))
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)
139
def list_all_proto(cap_hash):
    """Return the total count of every protocol across all indexed files.

    ``cap_hash`` maps a capture file name to a ``{protocol: count}`` dict.
    The result is a single ``{protocol: count}`` dict with the counts of all
    files summed.
    """
    proto_hash = {}
    # values()/items() work on both Python 2 and 3, unlike the iter* variants.
    for files_hash in cap_hash.values():
        for proto, count in files_hash.items():
            proto_hash[proto] = proto_hash.get(proto, 0) + count

    return proto_hash
147
def list_all_files(cap_hash):
    """Return the names of all indexed capture files as a sorted list."""
    # sorted() always yields a new list; keys().sort() breaks where keys()
    # returns a view instead of a list.
    return sorted(cap_hash)
153
def list_all_proto_files(cap_hash, proto_comma_delit):
    """Return the indexed files that contain at least one given protocol.

    ``proto_comma_delit`` is a comma-separated list of protocol names;
    whitespace around each name is ignored.  ``cap_hash`` maps a capture file
    name to a ``{protocol: count}`` dict.  Each matching file is listed once,
    in the iteration order of ``cap_hash``.
    """
    # A set gives constant-time membership tests for the wanted protocols.
    protos = set(x.strip() for x in proto_comma_delit.split(','))
    return [cap_file for (cap_file, files_hash) in cap_hash.items()
            if any(proto in protos for proto in files_hash)]
164
def index_file_action(options):
    """Tell whether an option that works on the index file alone was given.

    Returns the first truthy value among the ``list_all_proto``,
    ``list_all_files``, ``list_all_proto_files`` and ``dissect_files``
    options, or the value of ``dissect_files`` when none of them is set.
    """
    for selected in (options.list_all_proto,
                     options.list_all_files,
                     options.list_all_proto_files):
        if selected:
            return selected
    return options.dissect_files
170
def find_capture_files(paths, cap_hash):
    """Collect the capture files under ``paths`` that are not indexed yet.

    Directories are walked recursively and every file found in them is a
    candidate; any other path is taken as a candidate as given.  Candidates
    already present as keys of ``cap_hash`` are left out.
    """
    found = []
    for path in paths:
        if not os.path.isdir(path):
            if path not in cap_hash:
                found.append(path)
            continue

        for root, dirs, files in os.walk(os.path.normpath(path)):
            for name in files:
                candidate = os.path.join(root, name)
                if candidate not in cap_hash:
                    found.append(candidate)
    return found
181
def find_tshark_executable(bin_dir):
    """Locate an executable tshark binary inside ``bin_dir``.

    ``tshark.exe`` is tried before ``tshark``.  Returns the path of the first
    one that is executable, or None when neither is.
    """
    candidates = (os.path.join(bin_dir, name) for name in ("tshark.exe", "tshark"))
    return next((path for path in candidates if os.access(path, os.X_OK)), None)
189
def main():
    """Command-line entry point.

    Parses the options, loads the index file named by the first positional
    argument (starting with an empty index when it cannot be opened) and then
    performs one action:

    * list the protocols or files recorded in the index,
    * dissect the listed files with tshark (--dissect-files),
    * dissect the listed files with two tshark builds and compare the
      output (--compare-dir), or
    * index the capture files/directories given as remaining arguments.
    """
    parser = OptionParser(usage="usage: %prog [options] index_file [file_1|dir_1 [.. file_n|dir_n]]")
    parser.add_option("-d", "--dissect-files", dest="dissect_files", default=False, action="store_true",
                      help="Dissect all matching files")
    # sys.maxsize exists on both Python 2.6+ and 3; it only serves as an
    # "unlimited" default that is clamped to the number of files below.
    parser.add_option("-m", "--max-files", dest="max_files", default=sys.maxsize, type="int",
                      help="Max number of files to process")
    parser.add_option("-b", "--binary-dir", dest="bin_dir", default=os.getcwd(),
                      help="Directory containing tshark executable")
    parser.add_option("-c", "--compare-dir", dest="compare_dir", default=None,
                      help="Directory containing tshark executable which is used for comparison")
    parser.add_option("-j", dest="num_procs", default=multiprocessing.cpu_count(), type="int",
                      help="Max number of processes to spawn")
    parser.add_option("-r", "--randomize", default=False, action="store_true",
                      help="Randomize the file list order")
    parser.add_option("", "--list-all-proto", dest="list_all_proto", default=False, action="store_true",
                      help="List all protocols in index file")
    parser.add_option("", "--list-all-files", dest="list_all_files", default=False, action="store_true",
                      help="List all files in index file")
    parser.add_option("", "--list-all-proto-files", dest="list_all_proto_files", default=False,
                      metavar="PROTO_1[, .. PROTO_N]",
                      help="List all files in index file containing the given protocol")

    (options, args) = parser.parse_args()

    if len(args) == 0:
        parser.error("index_file is a required argument")

    if len(args) == 1 and not index_file_action(options):
        parser.error("one capture file/directory must be specified")

    if options.dissect_files and not options.list_all_files and not options.list_all_proto_files:
        parser.error("--list-all-files or --list-all-proto-files must be specified")

    if options.dissect_files and options.compare_dir is not None:
        parser.error("--dissect-files and --compare-dir cannot be specified at the same time")

    index_file_name = args.pop(0)
    paths = args
    cap_hash = {}
    try:
        # NOTE: unpickling can execute arbitrary code; only load index files
        # that were written by this tool.
        with open(index_file_name, "r") as index_file:
            # No newline yet: the file count is appended to the same line.
            sys.stdout.write("index file: %s [OPENED] " % index_file.name)
            cap_hash = pickle.load(index_file)
        print("%u files" % len(cap_hash))
    except IOError:
        print("index file: %s [NEW]" % index_file_name)

    if options.list_all_proto:
        print(list_all_proto(cap_hash))
        sys.exit(0)

    indexed_files = []
    if options.list_all_files:
        indexed_files = list_all_files(cap_hash)
        print(indexed_files)

    if options.list_all_proto_files:
        indexed_files = list_all_proto_files(cap_hash, options.list_all_proto_files)
        print(indexed_files)

    tshark_bin = find_tshark_executable(options.bin_dir)
    if tshark_bin is not None:
        print("tshark: %s [FOUND]" % tshark_bin)
    else:
        print("tshark: %s [MISSING]" % tshark_bin)
        sys.exit(1)

    tshark_cmp = None
    if options.compare_dir is not None:
        tshark_cmp = find_tshark_executable(options.compare_dir)
        if tshark_cmp is not None:
            print("tshark: %s [FOUND]" % tshark_cmp)
        else:
            print("tshark: %s [MISSING]" % tshark_cmp)
            sys.exit(1)

    if options.dissect_files or options.compare_dir:
        cap_files = indexed_files
    elif options.list_all_proto_files or options.list_all_files:
        sys.exit(0)
    else:
        cap_files = find_capture_files(paths, cap_hash)

    if options.randomize:
        random.shuffle(cap_files)
    else:
        cap_files.sort()

    options.max_files = min(options.max_files, len(cap_files))
    print("%u total files, %u working files" % (len(cap_files), options.max_files))
    cap_files = cap_files[:options.max_files]
    tmpdir = tempfile.mkdtemp()
    print("Temporary working dir: %s" % tmpdir)

    try:
        if options.compare_dir:
            compare_files(tshark_bin, tmpdir, tshark_cmp, options.num_procs, options.max_files, cap_files)
        elif options.dissect_files:
            dissect_files(tshark_bin, tmpdir, options.num_procs, options.max_files, cap_files)
        else:
            # extract_protos_from_file() takes no tmpdir argument; passing one
            # made every indexing run fail with a TypeError.
            extract_protos_from_file(tshark_bin, options.num_procs, options.max_files, cap_files, cap_hash, index_file_name)
    finally:
        # Runs even when an action leaves through sys.exit().  The directory
        # can only be removed when empty; if tshark output files are still in
        # it, it is deliberately left behind (its path was printed above).
        try:
            os.rmdir(tmpdir)
        except OSError:
            pass


if __name__ == "__main__":
    main()