Re: BUG #18735: Specific multibyte character in psql file path command parameter for Windows - Mailing list pgsql-bugs

From Tom Lane
Subject Re: BUG #18735: Specific multibyte character in psql file path command parameter for Windows
Date
Msg-id 3768611.1733698512@sss.pgh.pa.us
Whole thread Raw
In response to Re: BUG #18735: Specific multibyte character in psql file path command parameter for Windows  (Tatsuo Ishii <ishii@postgresql.org>)
List pgsql-bugs
Tatsuo Ishii <ishii@postgresql.org> writes:
>> Another way, given that we only really need this to work for
>> SJIS, is to hard-wire the logic into path.c --- it's not like
>> pg_sjis_mblen is either long or likely to change.  That's
>> ugly but would be a lot less invasive and safer to back-patch.
>> I'm leaning a bit to the second way, mainly because of the
>> extern-relocation annoyance.

> +1. I believe the logic of detecting byte length in Shift-JIS will not
> be changed.

OK.  I know I said I wasn't going to write this, but here's a draft
patch that assumes we need only touch psql and can rely on its idea
of client_encoding.

I'm not in a position to test this.

            regards, tom lane

diff --git a/src/bin/psql/command.c b/src/bin/psql/command.c
index 1f3cbb11f7..bffe614f00 100644
--- a/src/bin/psql/command.c
+++ b/src/bin/psql/command.c
@@ -1201,7 +1201,7 @@ exec_command_edit(PsqlScanState scan_state, bool active_branch,
                 expand_tilde(&fname);
                 if (fname)
                 {
-                    canonicalize_path(fname);
+                    canonicalize_path_enc(fname, pset.encoding);
                     /* Always clear buffer if the file isn't modified */
                     discard_on_quit = true;
                 }
@@ -2819,7 +2819,7 @@ exec_command_write(PsqlScanState scan_state, bool active_branch,
                 }
                 else
                 {
-                    canonicalize_path(fname);
+                    canonicalize_path_enc(fname, pset.encoding);
                     fd = fopen(fname, "w");
                 }
                 if (!fd)
@@ -4423,7 +4423,7 @@ process_file(char *filename, bool use_relative_path)
     }
     else if (strcmp(filename, "-") != 0)
     {
-        canonicalize_path(filename);
+        canonicalize_path_enc(filename, pset.encoding);

         /*
          * If we were asked to resolve the pathname relative to the location
@@ -4437,7 +4437,7 @@ process_file(char *filename, bool use_relative_path)
             strlcpy(relpath, pset.inputfile, sizeof(relpath));
             get_parent_directory(relpath);
             join_path_components(relpath, relpath, filename);
-            canonicalize_path(relpath);
+            canonicalize_path_enc(relpath, pset.encoding);

             filename = relpath;
         }
diff --git a/src/bin/psql/copy.c b/src/bin/psql/copy.c
index e020e4d665..1cfe31f164 100644
--- a/src/bin/psql/copy.c
+++ b/src/bin/psql/copy.c
@@ -280,7 +280,7 @@ do_copy(const char *args)

     /* prepare to read or write the target file */
     if (options->file && !options->program)
-        canonicalize_path(options->file);
+        canonicalize_path_enc(options->file, pset.encoding);

     if (options->from)
     {
diff --git a/src/include/port.h b/src/include/port.h
index ba9ab0d34f..1665ac0258 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -53,6 +53,7 @@ extern char *first_path_var_separator(const char *pathlist);
 extern void join_path_components(char *ret_path,
                                  const char *head, const char *tail);
 extern void canonicalize_path(char *path);
+extern void canonicalize_path_enc(char *path, int encoding);
 extern void make_native_path(char *filename);
 extern void cleanup_path(char *path);
 extern bool path_contains_parent_reference(const char *path);
diff --git a/src/port/path.c b/src/port/path.c
index de4df6cd78..02ab3188db 100644
--- a/src/port/path.c
+++ b/src/port/path.c
@@ -36,6 +36,7 @@
 #include <unistd.h>
 #endif

+#include "mb/pg_wchar.h"
 #include "pg_config_paths.h"


@@ -45,6 +46,10 @@
 #define IS_PATH_VAR_SEP(ch) ((ch) == ';')
 #endif

+#ifdef WIN32
+static void debackslash_path(char *path, int encoding);
+static int    pg_sjis_mblen(const unsigned char *s);
+#endif
 static void make_relative_path(char *ret_path, const char *target_path,
                                const char *bin_path, const char *my_exec_path);
 static char *trim_directory(char *path);
@@ -149,10 +154,73 @@ last_dir_separator(const char *filename)
 }


+#ifdef WIN32
+
+/*
+ * Convert '\' to '/' within the given path, assuming the path
+ * is in the specified encoding.
+ */
+static void
+debackslash_path(char *path, int encoding)
+{
+    char       *p;
+
+    /*
+     * Of the supported encodings, only Shift-JIS has multibyte characters
+     * that can include a byte equal to '\' (0x5C).  So rather than implement
+     * a fully encoding-aware conversion, we special-case SJIS.  (Invoking the
+     * general encoding-aware logic in wchar.c is impractical here for
+     * assorted reasons.)
+     */
+    if (encoding == PG_SJIS)
+    {
+        for (p = path; *p; p += pg_sjis_mblen((const unsigned char *) p))
+        {
+            if (*p == '\\')
+                *p = '/';
+        }
+    }
+    else
+    {
+        for (p = path; *p; p++)
+        {
+            if (*p == '\\')
+                *p = '/';
+        }
+    }
+}
+
 /*
- *    make_native_path - on WIN32, change / to \ in the path
+ * SJIS character length
  *
- *    This effectively undoes canonicalize_path.
+ * This must match the behavior of
+ *        pg_encoding_mblen_bounded(PG_SJIS, s)
+ * In particular, unlike the version of pg_sjis_mblen in src/common/wchar.c,
+ * do not allow caller to accidentally step past end-of-string.
+ */
+static int
+pg_sjis_mblen(const unsigned char *s)
+{
+    int            len;
+
+    if (*s >= 0xa1 && *s <= 0xdf)
+        len = 1;                /* 1 byte kana? */
+    else if (IS_HIGHBIT_SET(*s) && s[1] != '\0')
+        len = 2;                /* kanji? */
+    else
+        len = 1;                /* should be ASCII */
+    return len;
+}
+
+#endif                            /* WIN32 */
+
+
+/*
+ *    make_native_path - on WIN32, change '/' to '\' in the path
+ *
+ *    This reverses the '\'-to-'/' transformation of debackslash_path.
+ *    We need not worry about encodings here, since '/' does not appear
+ *    as a byte of a multibyte character in any supported encoding.
  *
  *    This is required because WIN32 COPY is an internal CMD.EXE
  *    command and doesn't process forward slashes in the same way
@@ -182,13 +250,14 @@ make_native_path(char *filename)
  * on Windows. We need them to use filenames without spaces, for which a
  * short filename is the safest equivalent, eg:
  *        C:/Progra~1/
+ *
+ * Presently, this is only used on paths that we can assume are in a
+ * backend-safe encoding, so there's no need for an encoding-aware variant.
  */
 void
 cleanup_path(char *path)
 {
 #ifdef WIN32
-    char       *ptr;
-
     /*
      * GetShortPathName() will fail if the path does not exist, or short names
      * are disabled on this file system.  In both cases, we just return the
@@ -198,11 +267,8 @@ cleanup_path(char *path)
     GetShortPathName(path, path, MAXPGPATH - 1);

     /* Replace '\' with '/' */
-    for (ptr = path; *ptr; ptr++)
-    {
-        if (*ptr == '\\')
-            *ptr = '/';
-    }
+    /* All server-safe encodings are alike here, so just use PG_SQL_ASCII */
+    debackslash_path(path, PG_SQL_ASCII);
 #endif
 }

@@ -253,6 +319,8 @@ typedef enum
 } canonicalize_state;

 /*
+ * canonicalize_path()
+ *
  *    Clean up path by:
  *        o  make Win32 path use Unix slashes
  *        o  remove trailing quote on Win32
@@ -260,9 +328,20 @@ typedef enum
  *        o  remove duplicate (adjacent) separators
  *        o  remove '.' (unless path reduces to only '.')
  *        o  process '..' ourselves, removing it if possible
+ *    Modifies path in-place.
+ *
+ * This comes in two variants: encoding-aware and not.  The non-aware version
+ * is only safe to use on strings that are in a server-safe encoding.
  */
 void
 canonicalize_path(char *path)
+{
+    /* All server-safe encodings are alike here, so just use PG_SQL_ASCII */
+    canonicalize_path_enc(path, PG_SQL_ASCII);
+}
+
+void
+canonicalize_path_enc(char *path, int encoding)
 {
     char       *p,
                *to_p;
@@ -278,17 +357,15 @@ canonicalize_path(char *path)
     /*
      * The Windows command processor will accept suitably quoted paths with
      * forward slashes, but barfs badly with mixed forward and back slashes.
+     * Hence, start by converting all back slashes to forward slashes.
      */
-    for (p = path; *p; p++)
-    {
-        if (*p == '\\')
-            *p = '/';
-    }
+    debackslash_path(path, encoding);

     /*
      * In Win32, if you do: prog.exe "a b" "\c\d\" the system will pass \c\d"
      * as argv[2], so trim off trailing quote.
      */
+    p = path + strlen(path);
     if (p > path && *(p - 1) == '"')
         *(p - 1) = '/';
 #endif

pgsql-bugs by date:

Previous
From: PG Bug reporting form
Date:
Subject: BUG #18741: Detaching a partition referencing a partitioned table fails with a trigger-related error
Next
From: Tender Wang
Date:
Subject: Re: BUG #18741: Detaching a partition referencing a partitioned table fails with a trigger-related error