TShark: Convert TTY output.
author: Gerald Combs <gerald@wireshark.org>
Fri, 6 May 2016 17:25:02 +0000 (10:25 -0700)
committer: Anders Broman <a.broman58@gmail.com>
Mon, 9 May 2016 04:11:29 +0000 (04:11 +0000)
If we detect that we're writing to a TTY and that it doesn't support
UTF-8, convert our output to the current code page on UNIX/Linux or
to UTF-16LE on Windows. This helps to ensure that we don't fill users'
screens with mojibake, along with scrubbing invalid output.

Add a note about our output behavior to the TShark man page. Add a note
about the glyphs we should and shouldn't be using to utf8_entities.h.

Bug: 12393

Change-Id: I52b6dd240173b80ffb6d35b5950a46a565c97ce8
Reviewed-on: https://code.wireshark.org/review/15277
Reviewed-by: Gerald Combs <gerald@wireshark.org>
Petri-Dish: Gerald Combs <gerald@wireshark.org>
Tested-by: Petri Dish Buildbot <buildbot-no-reply@wireshark.org>
Reviewed-by: Graham Bloice <graham.bloice@trihedral.com>
Reviewed-by: Anders Broman <a.broman58@gmail.com>
doc/tshark.pod
epan/print_stream.c
wsutil/utf8_entities.h

index fb88d53d9af6a97c634a5702e4e208653a7ccbe9..77082c7a1e16cb0962900d9fd2952db1d639b96d 100644 (file)
@@ -1741,6 +1741,20 @@ personal preferences file.
 
 =back
 
+=head1 OUTPUT
+
+B<TShark> uses UTF-8 to represent strings internally. In some cases the
+output might not be valid. For example, a dissector might generate
+invalid UTF-8 character sequences. Programs reading B<TShark> output
+should expect UTF-8 and be prepared for invalid output.
+
+If B<TShark> detects that it is writing to a TTY on UNIX or Linux and
+the locale does not support UTF-8, output will be re-encoded to match the
+current locale.
+
+If B<TShark> detects that it is writing to a TTY on Windows, output will be
+encoded as UTF-16LE.
+
 =head1 ENVIRONMENT VARIABLES
 
 =over 4
index f53965c75b6dacdd8d2ef6b1f6385932bd1194a2..740773af2eb4a6124587b6a1fe309ef5b1696fb1 100644 (file)
 
 #include <stdio.h>
 
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <string.h>
+#endif
+
 #include <glib.h>
 
 #include <epan/print_stream.h>
@@ -104,6 +110,13 @@ typedef struct {
 
 #define MAX_INDENT    160
 
+#ifdef _WIN32
+static char *to_codeset = "UTF-16LE";
+#else
+static char *tty_codeset = NULL;
+static char *to_codeset = NULL;
+#endif
+
 static gboolean
 print_line_text(print_stream_t *self, int indent, const char *line)
 {
@@ -128,7 +141,41 @@ print_line_text(print_stream_t *self, int indent, const char *line)
 
     ret = fwrite(spaces, 1, num_spaces, output->fh);
     if (ret == num_spaces) {
-        fputs(line, output->fh);
+        gchar *tty_out = NULL;
+
+#ifndef _WIN32
+        /* Is there a more reliable way to do this? */
+        if (!tty_codeset) {
+            gchar *upper_codeset;
+
+            tty_codeset = g_get_codeset();
+            upper_codeset = g_ascii_strup(tty_codeset, -1);
+            if (!strstr(upper_codeset, "UTF-8") && !strstr(upper_codeset, "UTF8")) {
+                to_codeset = tty_codeset;
+            }
+            g_free(upper_codeset);
+        }
+#endif
+
+        if (ws_isatty(ws_fileno(output->fh)) && to_codeset) {
+            /* XXX Allocating a fresh buffer every line probably isn't the
+             * most efficient way to do this. However, this has the side
+             * effect of scrubbing invalid output.
+             */
+            tty_out = g_convert_with_fallback(line, -1, to_codeset, "UTF-8", "?", NULL, NULL, NULL);
+        }
+
+        if (tty_out) {
+#ifdef _WIN32
+            DWORD out_len = (DWORD) wcslen((wchar_t *) tty_out);
+            WriteConsoleW((HANDLE)_get_osfhandle(_fileno(output->fh)), tty_out, out_len, &out_len, NULL);
+#else
+            fputs(tty_out, output->fh);
+#endif
+            g_free(tty_out);
+        } else {
+            fputs(line, output->fh);
+        }
         putc('\n', output->fh);
     }
     return !ferror(output->fh);
index dc5deba55474e46213261886bf2376505ea500c0..13dba64270c95de807e5a12291da13e16ce73d4d 100644 (file)
  * http://www.fileformat.info/info/unicode/
  * http://www.utf8-chartable.de/
  * and other places
+ *
+ * While many modern systems default to UTF-8 and handle it well, some do
+ * not. The Windows console is a notable example. When adding a glyph below
+ * you probably shouldn't stray too far from code page 437 or WGL4:
+ * https://en.wikipedia.org/wiki/Code_page_437
+ * https://en.wikipedia.org/wiki/Windows_Glyph_List_4
  */
 
 #define UTF8_DEGREE_SIGN                    "\xc2\xb0"      /*   176 /   0xb0 */
@@ -43,6 +49,7 @@
 #define UTF8_RIGHTWARDS_ARROW           "\xe2\x86\x92"      /*  8594 / 0x2192 */
 #define UTF8_LEFT_RIGHT_ARROW           "\xe2\x86\x94"      /*  8596 / 0x2194 */
 
+/* OS X command key */
 #define UTF8_PLACE_OF_INTEREST_SIGN     "\xe2\x8c\x98"      /*  8984 / 0x2318 */
 
 #define UTF8_SYMBOL_FOR_NULL            "\xe2\x90\x80"      /*  9216 / 0x2400 */