fix-encoding-args.pl: fix terrible performance with large files
author Peter Wu <peter@lekensteyn.nl>
Sat, 22 Sep 2018 11:47:22 +0000 (13:47 +0200)
committer Anders Broman <a.broman58@gmail.com>
Sat, 22 Sep 2018 15:44:20 +0000 (15:44 +0000)
"fix-encoding-args.pl epan/dissectors/packet-ieee80211.c" used to take
over 12 seconds to complete. After this change it is reduced to 400ms.
Profiling with Devel::NYTProf showed two issues:
- find_hf_array_entries (5 seconds): matching leading whitespace
  triggers a candidate match against every line. Fix this by removing
  whitespace prior to matching.
- fix_encoding_args_by_hf_type (7.5 seconds): executing 2131 different
  substitution patterns is slow. Fix this by grouping field names and
  executing the substitution only once afterwards (6 calls in total).

packet-rrc.c is by far the largest file, with 215k lines; it used to
take forever (321s) and now completes in 1.3s.

Regression tested by removing "ENC_ASCII" and "ENC_UTF_8" in
dissect_venue_name_info; the expected warnings are still visible.

Change-Id: I071038e8fcb56474ac41223568ce6724258c059d
Reviewed-on: https://code.wireshark.org/review/29789
Petri-Dish: Peter Wu <peter@lekensteyn.nl>
Tested-by: Petri Dish Buildbot
Reviewed-by: Anders Broman <a.broman58@gmail.com>
tools/fix-encoding-args.pl

index a4c4887914f886a37c3939b6ce532acfe7ed7856..f05163ce972c21bdf973387a7bf24dbb489ed16a 100755 (executable)
@@ -390,22 +390,22 @@ sub find_hf_array_entries {
         }
     }
 
+    # pre-process contents to fold multiple lines and speed up matching.
+    $fileContentsWithoutComments =~ s/\s*=\s*/=/gs;
+    $fileContentsWithoutComments =~ s/^\s+//g;
+
     # RegEx to get "proto" variable name
     my $protoRegEx = qr /
-                            ^ \s*                     # note m modifier below
+                            ^                         # note m modifier below
                             (
                                 [a-zA-Z0-9_]+
                             )
-                            \s*
                             =
-                            \s*
-                            proto_register_protocol
-                            \s*
-                            \(
-                        /xoms;
+                            proto_register_protocol\b
+                        /xom;
 
     # Find all registered protocols
-    while ($fileContentsWithoutComments =~ m { $protoRegEx }xgioms ) {
+    while ($fileContentsWithoutComments =~ m { $protoRegEx }xgom ) {
         ##print "$1\n";
         if (exists $hfArrayEntryFieldType{$1}) {
             printf "%-35.35s: ? duplicate 'proto': no fixes done for: $1; manual action may be req'd\n", $fileName;
@@ -517,8 +517,8 @@ sub find_hf_array_entries {
 #      - ref to array containing hf[] types to be processed (FT_STRING, etc)
 #      - ref to hash containing search (keys) and replacement (values) for encoding arg
 #   fcn_name string
-#   ref to hfArrayEntries hash (key: hf name; value: field type)
 #   ref to string containing file contents
+#   ref to hfArrayEntries hash (key: hf name; value: field type)
 #   filename string
 
 {  # block begin
@@ -573,24 +573,32 @@ sub find_hf_array_entries {
             $encArgPat = qr / [^,)]+? /x;
         }
 
+        my @hf_index_names;
+
         # For each hf[] entry which matches a type in %hfTypes do replacements
         $found = 0;
         foreach my $key (keys %$hfArrayEntryFieldTypeHRef) {
             $hf_index_name = $key;
-            $hf_index_name =~ s{ ( \[ | \] ) }{\\$1}xg;     # escape any "[" or "]" characters
             $hf_field_type = $$hfArrayEntryFieldTypeHRef{$key};
             ##printf "--> %-35.35s: %s\n", $hf_index_name,  $hf_field_type;
 
             next unless exists $hfTypes{$hf_field_type};    # Do we want to process for this hf[] entry type ?
 
+            ##print "\n$hf_index_name $hf_field_type\n";
+            push @hf_index_names, $hf_index_name;
+        }
+
+        if (@hf_index_names) {
             # build the complete pattern
+            my $hf_index_names_re = join('|', @hf_index_names);
+            $hf_index_names_re =~ s/\[|\]/\\$&/g;   # escape any "[" or "]" characters
             my $patRegEx = qr /
                                   # part 1: $1
                                   (
                                       $fcn_name \s* \(
                                       [^;]+?
                                       ,\s*
-                                      $hf_index_name
+                                      (?:$hf_index_names_re)
                                       \s*,
                                       [^;]+
                                       ,\s*
@@ -607,7 +615,6 @@ sub find_hf_array_entries {
                                   )
                               /xs;
 
-            ##print "\n$hf_index_name $hf_field_type\n";
             ##print "\n$patRegEx\n";
 
             ## Match and substitute as specified