tests: add regression tests for Follow TCP Stream
[metze/wireshark/wip.git] / tools / indexcap.py
1 #!/usr/bin/env python
2 #
3 # Tool to index the protocols that appear in the given capture files
4 #
5 # The script list_protos_in_cap.sh does the same thing.
6 #
7 # Copyright 2009, Kovarththanan Rajaratnam <kovarththanan.rajaratnam@gmail.com>
8 #
9 # Wireshark - Network traffic analyzer
10 # By Gerald Combs <gerald@wireshark.org>
11 # Copyright 1998 Gerald Combs
12 #
13 # SPDX-License-Identifier: GPL-2.0-or-later
14 #
15
16 from optparse import OptionParser
17 import multiprocessing
18 import sys
19 import os
20 import subprocess
21 import re
22 import pickle
23 import tempfile
24 import filecmp
25 import random
26
def extract_protos_from_file_proces(tshark, file):
    """Count how often each protocol occurs in one capture file.

    Runs ``tshark -Tfields -e frame.protocols -r <file>`` and tallies every
    name found in the colon-separated protocol list printed for each frame.

    Returns a ``(file, counts)`` tuple where ``counts`` maps protocol name to
    the number of frames it appeared in. ``counts`` is empty when tshark
    exits with a non-zero status. Returns ``None`` if the call is interrupted
    by a KeyboardInterrupt.
    """
    try:
        tshark_proc = subprocess.Popen(
            [tshark, "-Tfields", "-e", "frame.protocols", "-r", file],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        # stderr is captured only so that it does not reach the terminal.
        output = tshark_proc.communicate()[0]
        if sys.version_info[0] >= 3:
            output = output.decode('utf-8')
        if tshark_proc.returncode != 0:
            return (file, {})

        counts = {}
        for row in output.splitlines():
            # Only rows that look like "eth:ip:tcp" are counted.
            if re.match(r'^[\w:-]+$', row):
                for name in row.split(':'):
                    counts[name] = counts.get(name, 0) + 1

        return (file, counts)
    except KeyboardInterrupt:
        return None
48
def extract_protos_from_file(tshark, num_procs, max_files, cap_files, cap_hash, index_file_name):
    """Index the protocols of ``cap_files`` in parallel and save the index.

    One ``extract_protos_from_file_proces`` job per capture file is queued on
    a pool of ``num_procs`` worker processes. Results are reported in
    submission order and merged into ``cap_hash`` (capture path -> protocol
    counts), which is finally pickled to ``index_file_name``.

    ``max_files`` is only used as the total shown in the progress output.

    This function does not return: it exits with status 0 once the index has
    been written, or with status 1 when interrupted.
    """
    pool = multiprocessing.Pool(num_procs)
    results = [pool.apply_async(extract_protos_from_file_proces, [tshark, file]) for file in cap_files]
    try:
        for (cur_item_idx, result_async) in enumerate(results):
            file_result = result_async.get()
            if file_result is None:
                # The worker returns None when it was interrupted itself;
                # treat that exactly like an interrupt of this process.
                raise KeyboardInterrupt
            (cap_file, proto_hash) = file_result
            # The worker hands back an empty table when tshark failed. This
            # must be a value test: "is {}" compares against a brand new dict
            # object and is therefore never true.
            action = "PROCESSED" if proto_hash else "SKIPPED"
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, cap_file, os.path.getsize(cap_file)))
            cap_hash[cap_file] = proto_hash
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)

    # The context manager closes the index file even if pickling fails.
    with open(index_file_name, "wb") as index_file:
        pickle.dump(cap_hash, index_file)
    sys.exit(0)
67
def dissect_file_process(tshark, tmpdir, file):
    """Dissect one capture file with tshark, saving its output in ``tmpdir``.

    Runs ``tshark -nxVr <file>`` with stdout and stderr redirected into two
    freshly created temporary files (suffixes ``_stdout`` and ``_stderr``).

    Returns ``(file, ok, stdout_path, stderr_path)`` where ``ok`` is True when
    tshark exited with status 0. Returns ``False`` if the call is interrupted
    by a KeyboardInterrupt. The temporary files are not deleted here.
    """
    # Track only the descriptors that were really opened, so that cleanup
    # cannot fail on an unbound name (and hide the original error) when
    # creating one of the temporary files fails.
    open_handles = []
    try:
        (handle_o, tmpfile_o) = tempfile.mkstemp(suffix='_stdout', dir=tmpdir)
        open_handles.append(handle_o)
        (handle_e, tmpfile_e) = tempfile.mkstemp(suffix='_stderr', dir=tmpdir)
        open_handles.append(handle_e)
        cmd = [tshark, "-nxVr", file]
        p = subprocess.Popen(cmd, stdout=handle_o, stderr=handle_e)
        # Output goes straight to the temporary files; this only waits.
        p.communicate()
        return (file, p.returncode == 0, tmpfile_o, tmpfile_e)

    except KeyboardInterrupt:
        return False

    finally:
        for handle in open_handles:
            os.close(handle)
86
def dissect_files(tshark, tmpdir, num_procs, max_files, cap_files):
    """Dissect ``cap_files`` in parallel and report PASSED/FAILED per file.

    One ``dissect_file_process`` job per capture file is queued on a pool of
    ``num_procs`` worker processes; their output files are created in
    ``tmpdir`` and left there. Results are printed in submission order.

    ``max_files`` is only used as the total shown in the progress output.

    Exits with status 1 when interrupted.
    """
    pool = multiprocessing.Pool(num_procs)
    results = [pool.apply_async(dissect_file_process, [tshark, tmpdir, file]) for file in cap_files]
    try:
        for (cur_item_idx, result_async) in enumerate(results):
            file_result = result_async.get()
            if file_result is False:
                # The worker returns a bare False (not a tuple) when it was
                # interrupted; handle it like an interrupt of this process
                # instead of failing on the subscript below.
                raise KeyboardInterrupt
            action = "FAILED" if file_result[1] is False else "PASSED"
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result[0], os.path.getsize(file_result[0])))
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)
99
def compare_files(tshark_bin, tmpdir, tshark_cmp, num_procs, max_files, cap_files):
    """Dissect every capture with two tshark binaries and compare the output.

    Each file in ``cap_files`` is dissected once with ``tshark_bin`` and once
    with ``tshark_cmp`` (via ``dissect_file_process``) on a pool of
    ``num_procs`` worker processes. A file passes only when both runs exit
    successfully and their stdout and stderr captures are identical; the
    first check that fails determines the reported reason.

    The temporary output files in ``tmpdir`` are removed for passing files
    and kept for failing ones so they can be inspected.

    ``max_files`` is only used as the total shown in the progress output.

    Exits with status 1 when interrupted.
    """
    pool = multiprocessing.Pool(num_procs)
    results_bin = [pool.apply_async(dissect_file_process, [tshark_bin, tmpdir, file]) for file in cap_files]
    results_cmp = [pool.apply_async(dissect_file_process, [tshark_cmp, tmpdir, file]) for file in cap_files]
    try:
        for (cur_item_idx, (result_async_bin, result_async_cmp)) in enumerate(zip(results_bin, results_cmp)):
            file_result_bin = result_async_bin.get()
            file_result_cmp = result_async_cmp.get()
            if file_result_bin is False or file_result_cmp is False:
                # A worker returns a bare False when it was interrupted.
                raise KeyboardInterrupt

            # The checks form a single chain: with independent "if"
            # statements a later check would overwrite an earlier failure,
            # and matching stderr alone would report PASSED and delete the
            # evidence. shallow=False forces a content comparison; the
            # default may accept files whose size and mtime merely coincide.
            if file_result_cmp[1] is False or file_result_bin[1] is False:
                action = "FAILED (exitcode)"
            elif not filecmp.cmp(file_result_bin[2], file_result_cmp[2], shallow=False):
                action = "FAILED (stdout)"
            elif not filecmp.cmp(file_result_bin[3], file_result_cmp[3], shallow=False):
                action = "FAILED (stderr)"
            else:
                action = "PASSED"
                for tmpfile in file_result_bin[2:] + file_result_cmp[2:]:
                    os.remove(tmpfile)

            # One line per tshark run (reference binary, then comparison).
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result_bin[0], os.path.getsize(file_result_bin[0])))
            print("%s [%u/%u] %s %u bytes" % (action, cur_item_idx+1, max_files, file_result_cmp[0], os.path.getsize(file_result_cmp[0])))
    except KeyboardInterrupt:
        print("%s was interrupted by user" % (sys.argv[0]))
        pool.terminate()
        sys.exit(1)
127
def list_all_proto(cap_hash):
    """Sum the per-file protocol counts of the whole index.

    ``cap_hash`` maps a capture file to its ``{protocol: count}`` table; the
    result is a single ``{protocol: total count}`` table over all files.
    """
    totals = {}
    for per_file_counts in cap_hash.values():
        for name in per_file_counts:
            totals[name] = totals.get(name, 0) + per_file_counts[name]
    return totals
135
def list_all_files(cap_hash):
    """Return the capture files recorded in the index as a sorted list."""
    return sorted(cap_hash.keys())
141
def list_all_proto_files(cap_hash, proto_comma_delit):
    """Return the indexed files that contain any of the given protocols.

    ``proto_comma_delit`` is a comma-separated list of protocol names;
    whitespace around each name is ignored. A file is listed once, in index
    order, if its protocol table has at least one of the names.
    """
    wanted = [name.strip() for name in proto_comma_delit.split(',')]
    return [
        cap_file
        for (cap_file, per_file_counts) in cap_hash.items()
        if any(name in wanted for name in per_file_counts.keys())
    ]
152
def index_file_action(options):
    """Tell whether an option that works on the existing index was given.

    Returns the first truthy value among ``list_all_proto``,
    ``list_all_files`` and ``list_all_proto_files``; if none of them is
    truthy, the value of ``dissect_files`` is returned.
    """
    for attr in ("list_all_proto", "list_all_files", "list_all_proto_files"):
        chosen = getattr(options, attr)
        if chosen:
            return chosen
    return options.dissect_files
158
def find_capture_files(paths, cap_hash):
    """Collect the capture files under ``paths`` that are not indexed yet.

    Directories are normalised and walked recursively; any other path is
    taken as a file name as given. Paths that are already keys of
    ``cap_hash`` are left out.
    """
    pending = []
    for path in paths:
        if not os.path.isdir(path):
            if path not in cap_hash:
                pending.append(path)
            continue

        for root, _dirs, names in os.walk(os.path.normpath(path)):
            for name in names:
                candidate = os.path.join(root, name)
                if candidate not in cap_hash:
                    pending.append(candidate)
    return pending
169
def find_tshark_executable(bin_dir):
    """Locate the tshark binary inside ``bin_dir``.

    ``tshark.exe`` is tried before ``tshark``. Returns the path of the first
    candidate that ``os.access`` reports as executable, or ``None`` if
    neither is.
    """
    candidates = (os.path.join(bin_dir, name) for name in ("tshark.exe", "tshark"))
    return next((path for path in candidates if os.access(path, os.X_OK)), None)
177
def main():
    """Command-line entry point.

    Parses the options, loads the pickled index named by the first
    positional argument (starting from an empty index if it cannot be
    opened), and then performs one of:

    * print the index contents (``--list-all-proto``, ``--list-all-files``,
      ``--list-all-proto-files``);
    * dissect the listed files (``--dissect-files``) or compare their
      dissection between two tshark builds (``--compare-dir``);
    * otherwise, index the capture files/directories given as the remaining
      arguments and write the updated index back.

    The process exits with status 1 when a tshark binary cannot be found.
    """
    parser = OptionParser(usage="usage: %prog [options] index_file [file_1|dir_1 [.. file_n|dir_n]]")
    parser.add_option("-d", "--dissect-files", dest="dissect_files", default=False, action="store_true",
                      help="Dissect all matching files")
    parser.add_option("-m", "--max-files", dest="max_files", default=sys.maxsize, type="int",
                      help="Max number of files to process")
    parser.add_option("-b", "--binary-dir", dest="bin_dir", default=os.getcwd(),
                      help="Directory containing tshark executable")
    parser.add_option("-c", "--compare-dir", dest="compare_dir", default=None,
                      help="Directory containing tshark executable which is used for comparison")
    parser.add_option("-j", dest="num_procs", default=multiprocessing.cpu_count(), type=int,
                      help="Max number of processes to spawn")
    parser.add_option("-r", "--randomize", default=False, action="store_true",
                      help="Randomize the file list order")
    parser.add_option("", "--list-all-proto", dest="list_all_proto", default=False, action="store_true",
                      help="List all protocols in index file")
    parser.add_option("", "--list-all-files", dest="list_all_files", default=False, action="store_true",
                      help="List all files in index file")
    parser.add_option("", "--list-all-proto-files", dest="list_all_proto_files", default=False,
                      metavar="PROTO_1[, .. PROTO_N]",
                      help="List all files in index file containing the given protocol")

    (options, args) = parser.parse_args()

    if len(args) == 0:
        parser.error("index_file is a required argument")

    if len(args) == 1 and not index_file_action(options):
        parser.error("one capture file/directory must be specified")

    if options.dissect_files and not options.list_all_files and not options.list_all_proto_files:
        parser.error("--list-all-files or --list-all-proto-files must be specified")

    if options.dissect_files and options.compare_dir is not None:
        parser.error("--dissect-files and --compare-dir cannot be specified at the same time")

    index_file_name = args.pop(0)
    paths = args
    cap_hash = {}
    try:
        # The context manager closes the file even if unpickling fails.
        with open(index_file_name, "rb") as index_file:
            print("index file: %s [OPENED]" % index_file.name)
            # NOTE: unpickling can execute arbitrary code; only use index
            # files that come from a trusted source.
            cap_hash = pickle.load(index_file)
        print("%d files" % len(cap_hash))
    except IOError:
        print("index file: %s [NEW]" % index_file_name)

    if options.list_all_proto:
        print(list_all_proto(cap_hash))
        sys.exit(0)

    indexed_files = []
    if options.list_all_files:
        indexed_files = list_all_files(cap_hash)
        print(indexed_files)

    if options.list_all_proto_files:
        indexed_files = list_all_proto_files(cap_hash, options.list_all_proto_files)
        print(indexed_files)

    tshark_bin = find_tshark_executable(options.bin_dir)
    if tshark_bin is not None:
        print("tshark: %s [FOUND]" % tshark_bin)
    else:
        # Report the directory that was searched; the lookup result is None
        # on this branch and would not tell the user anything.
        print("tshark: %s [MISSING]" % options.bin_dir)
        sys.exit(1)

    if options.compare_dir is not None:
        tshark_cmp = find_tshark_executable(options.compare_dir)
        if tshark_cmp is not None:
            print("tshark: %s [FOUND]" % tshark_cmp)
        else:
            print("tshark: %s [MISSING]" % options.compare_dir)
            sys.exit(1)

    if options.dissect_files or options.compare_dir:
        cap_files = indexed_files
    elif options.list_all_proto_files or options.list_all_files:
        sys.exit(0)
    else:
        cap_files = find_capture_files(paths, cap_hash)

    if options.randomize:
        random.shuffle(cap_files)
    else:
        cap_files.sort()

    options.max_files = min(options.max_files, len(cap_files))
    print("%u total files, %u working files" % (len(cap_files), options.max_files))
    cap_files = cap_files[:options.max_files]
    if options.compare_dir or options.dissect_files:
        tmpdir = tempfile.mkdtemp()
        print("Temporary working dir: %s" % tmpdir)
    try:
        if options.compare_dir:
            compare_files(tshark_bin, tmpdir, tshark_cmp, options.num_procs, options.max_files, cap_files)
        elif options.dissect_files:
            dissect_files(tshark_bin, tmpdir, options.num_procs, options.max_files, cap_files)
        else:
            extract_protos_from_file(tshark_bin, options.num_procs, options.max_files, cap_files, cap_hash, index_file_name)
    finally:
        # Dissection may result in a non-empty directory.
        if options.compare_dir:
            try:
                os.rmdir(tmpdir)
            except OSError:
                # Output of failed comparisons is still in there. Keep it
                # for inspection rather than dying with a traceback (which
                # would also replace a pending exit status).
                print("Temporary working dir kept: %s" % tmpdir)
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()