#2220, write the encoding down when exporting a shape file

author Paul Ramsey <pramsey@cleverelephant.ca>

Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)

committer Paul Ramsey <pramsey@cleverelephant.ca>

Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)
author Paul Ramsey <pramsey@cleverelephant.ca>
Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)
committer Paul Ramsey <pramsey@cleverelephant.ca>
Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)
diff --git a/loader/pgsql2shp-core.c b/loader/pgsql2shp-core.c

index dfb6879f932d58fcf42c00a35a7d957380ef7715..9e3f115602f3ec26501f4b1fbb7f84fa146da319 100644 (file)
--- a/loader/pgsql2shp-core.c
+++ b/loader/pgsql2shp-core.c
@@ -1217,6 +1217,11 @@ ShpDumperGetConnectionStringFromConn(SHPCONNECTIONCONFIG *conn)
                 strcat(connstring, conn->database);
         }
  
+       if ( ! getenv("PGCLIENTENCODING") )
+       {
+               strcat(connstring, " client_encoding=UTF8");
+       }
+
         return connstring;
  }
  
@@ -1417,8 +1422,19 @@ ShpDumperOpenTable(SHPDUMPERSTATE *state)
         else
                 state->shp_file = state->table;
  
-       /* Create the dbf file */
-       state->dbf = DBFCreate(state->shp_file);
+       /* Create the dbf file: */
+       /* If there's a user-specified encoding hanging around, try and use that. */
+       /* Otherwise, just use UTF-8 encoding, since that's usually our client encoding. */
+       if ( getenv("PGCLIENTENCODING") )
+       {
+               char *codepage = encoding2codepage(getenv("PGCLIENTENCODING"));
+               state->dbf = DBFCreateEx(state->shp_file, codepage);
+       }
+       else
+       {
+               state->dbf = DBFCreateEx(state->shp_file, "UTF-8");
+       }
+               
         if (!state->dbf)
         {
                 snprintf(state->message, SHPDUMPERMSGLEN, _("Could not create dbf file %s"), state->shp_file);
diff --git a/loader/shp2pgsql-core.c b/loader/shp2pgsql-core.c

index 5fb3d06de32a10915e7e11c471d12981d74e6a2a..460baffb7ae11a7929d15c9a09b223855be573a7 100644 (file)
--- a/loader/shp2pgsql-core.c
+++ b/loader/shp2pgsql-core.c
@@ -20,80 +20,6 @@
  #include "../liblwgeom/lwgeom_log.h" /* for LWDEBUG macros */
  
  
-typedef struct 
-{
-    int ldid;
-    int cpg;
-    char *desc;
-    char *iconv;
-    char *pg;
-} code_page_entry;
-
-static int num_code_pages = 60;
-
-static code_page_entry code_pages[] = {
-    {0x01, 437, "U.S. MS-DOS", "CP437",""},
-    {0x02, 850, "International MS-DOS", "CP850",""},
-    {0x03, 1252, "Window ANSI", "WINDOWS-1252","WIN1252"},
-    {0x08, 865, "Danish OEM", "CP865",""},
-    {0x09, 437, "Dutch OEM", "CP437",""},
-    {0x0A, 850, "Dutch OEM*", "CP850",""},
-    {0x0B, 437, "Finnish OEM", "CP437",""},
-    {0x0D, 437, "French OEM", "CP437",""},
-    {0x0E, 850, "French OEM*", "CP850",""},
-    {0x0F, 437, "German OEM", "CP437",""},
-    {0x10, 850, "German OEM*", "CP850",""},
-    {0x11, 437, "Italian OEM", "CP437",""},
-    {0x12, 850, "Italian OEM*", "CP850",""},
-    {0x13, 932, "Japanese Shift-JIS", "CP932","SJIS"},
-    {0x14, 850, "Spanish OEM*", "CP850",""},
-    {0x15, 437, "Swedish OEM", "CP437",""},
-    {0x16, 850, "Swedish OEM*", "CP850",""},
-    {0x17, 865, "Norwegian OEM", "CP865",""},
-    {0x18, 437, "Spanish OEM", "CP865",""},
-    {0x19, 437, "English OEM (Britain)", "CP437",""},
-    {0x1A, 850, "English OEM (Britain)*", "CP850",""},
-    {0x1B, 437, "English OEM (U.S.)", "CP437",""},
-    {0x1C, 863, "French OEM (Canada)", "CP863",""},
-    {0x1D, 850, "French OEM*", "CP850",""},
-    {0x1F, 852, "Czech OEM", "CP852",""},
-    {0x22, 852, "Hungarian OEM", "CP852",""},
-    {0x23, 852, "Polish OEM", "CP852",""},
-    {0x24, 860, "Portugese OEM", "CP860",""},
-    {0x25, 850, "Potugese OEM*", "CP850",""},
-    {0x26, 866, "Russian OEM", "WINDOWS-866","WIN866"},
-    {0x37, 850, "English OEM (U.S.)*", "CP850",""},
-    {0x40, 852, "Romanian OEM", "CP852",""},
-    {0x4D, 936, "Chinese GBK (PRC)", "CP936",""},
-    {0x4E, 949, "Korean (ANSI/OEM)", "CP949",""},
-    {0x4F, 950, "Chinese Big 5 (Taiwan)", "CP950","BIG5"},
-    {0x50, 874, "Thai (ANSI/OEM)", "WIN874",""},
-    {0x57, 1252, "ANSI", "WINDOWS-1252",""},
-    {0x58, 1252, "Western European ANSI", "WINDOWS-1252",""},
-    {0x59, 1252, "Spanish ANSI", "WINDOWS-1252",""},
-    {0x64, 852, "Eastern European MS-DOS", "CP852",""},
-    {0x65, 866, "Russian MS-DOS", "CP866",""},
-    {0x66, 865, "Nordic MS-DOS", "CP865",""},
-    {0x67, 861, "Icelandic MS-DOS", "",""},
-    {0x6A, 737, "Greek MS-DOS (437G)", "CP737",""},
-    {0x6B, 857, "Turkish MS-DOS", "CP857",""},
-    {0x6C, 863, "French-Canadian MS-DOS", "CP863",""},
-    {0x78, 950, "Taiwan Big 5", "CP950",""},
-    {0x79, 949, "Hangul (Wansung)", "CP949",""},
-    {0x7A, 936, "PRC GBK", "CP936","GBK"},
-    {0x7B, 932, "Japanese Shift-JIS", "CP932",""},
-    {0x7C, 874, "Thai Windows/MS-DOS", "WINDOWS-874","WIN874"},
-    {0x86, 737, "Greek OEM", "CP737",""},
-    {0x87, 852, "Slovenian OEM", "CP852",""},
-    {0x88, 857, "Turkish OEM", "CP857",""},
-    {0xC8, 1250, "Eastern European Windows", "WINDOWS-1250","WIN1250"},
-    {0xC9, 1251, "Russian Windows", "WINDOWS-1251","WIN1251"},
-    {0xCA, 1254, "Turkish Windows", "WINDOWS-1254","WIN1254"},
-    {0xCB, 1253, "Greek Windows", "WINDOWS-1253","WIN1253"},
-    {0xCC, 1257, "Baltic Window", "WINDOWS-1257","WIN1257"},
-    {0xFF, 65001, "UTF-8", "UTF-8","UTF8"}
-};
-
  
  /* Internal ring/point structures */
  typedef struct struct_point
@@ -128,58 +54,6 @@ int FindPolygons(SHPObject *obj, Ring ***Out);
  void ReleasePolygons(Ring **polys, int npolys);
  int GeneratePolygonGeometry(SHPLOADERSTATE *state, SHPObject *obj, char **geometry);
  
-/*
-* Code page info will come out of dbfopen as either a bare codepage number
-* (e.g. 1256) or as "LDID/1234" from the DBF hreader. 
-*/
-static char *
-codepage2encoding(const char *cpg)
-{
-    int cpglen;
-    int is_ldid = 0;
-    int num, i;
-    
-    /* Do nothing on nothing. */
-    if ( ! cpg ) return NULL;
-    
-    /* Is this an LDID string? */
-    /* If so, note it and move past the "LDID/" tag */
-    cpglen = strlen(cpg);
-    if ( strstr(cpg, "LDID/") )
-    {
-        if ( cpglen > 5 )
-        {
-            cpg += 5;
-            is_ldid = 1;
-        }
-        else
-        {
-            return NULL;
-        }
-    }
-    
-    /* Read the number */
-    num = atoi(cpg);
-    
-    /* Can we find this number in our lookup table? */
-    for ( i = is_ldid ; i < num_code_pages; i++ )
-    {
-        if ( is_ldid )
-        {
-            if ( code_pages[i].ldid == num )
-                return strdup(code_pages[i].iconv);
-        }
-        else
-        {
-            if ( code_pages[i].cpg == num )
-                return strdup(code_pages[i].iconv);
-        }
-    }
-    
-    /* Didn't find a matching entry */
-    return NULL;
-    
-}
  
  /* Return allocated string containing UTF8 string converted from encoding fromcode */
  static int 
diff --git a/loader/shpcommon.c b/loader/shpcommon.c

index 8d252229dfe49eaec93f35772477b85dee16acf7..e3f0b412689ac361ca359c49bc45b0ca0c3bd112 100644 (file)
--- a/loader/shpcommon.c
+++ b/loader/shpcommon.c
@@ -196,3 +196,86 @@ colmap_read(const char *filename, colmap *map, char *errbuf, size_t errbuflen)
    return 1;
  }
  
+/*
+* Code page info will come out of dbfopen as either a bare codepage number
+* (e.g. 1256) or as "LDID/1234" from the DBF hreader. We want to look up 
+* the equivalent iconv encoding string so we can use iconv to transcode
+* the data into UTF8
+*/
+char *
+codepage2encoding(const char *cpg)
+{
+    int cpglen;
+    int is_ldid = 0;
+    int num, i;
+    
+    /* Do nothing on nothing. */
+    if ( ! cpg ) return NULL;
+    
+    /* Is this an LDID string? */
+    /* If so, note it and move past the "LDID/" tag */
+    cpglen = strlen(cpg);
+    if ( strstr(cpg, "LDID/") )
+    {
+        if ( cpglen > 5 )
+        {
+            cpg += 5;
+            is_ldid = 1;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+    
+    /* Read the number */
+    num = atoi(cpg);
+    
+    /* Can we find this number in our lookup table? */
+    for ( i = is_ldid ; i < num_code_pages; i++ )
+    {
+        if ( is_ldid )
+        {
+            if ( code_pages[i].ldid == num )
+                return strdup(code_pages[i].iconv);
+        }
+        else
+        {
+            if ( code_pages[i].cpg == num )
+                return strdup(code_pages[i].iconv);
+        }
+    }
+    
+    /* Didn't find a matching entry */
+    return NULL;
+    
+}
+
+/*
+* In the case where data is coming out of the database in some wierd encoding
+* we want to look up the appropriate code page entry to feed to DBFCreateEx
+*/
+char *
+encoding2codepage(const char *encoding)
+{
+       int i;
+       for ( i = 0; i < num_code_pages; i++ )
+       {
+               if ( strcasecmp(encoding, code_pages[i].pg) == 0 )
+               {
+                       if ( code_pages[i].ldid == 0xFF )
+                       {
+                               return strdup("UTF-8");
+                       }
+                       else
+                       {
+                               char *codepage = NULL;
+                               asprintf(&codepage, "LDID/%d", code_pages[i].ldid);
+                               return codepage;
+                       }
+               }
+       }
+
+       /* OK, we give up, pretend it's UTF8 */
+       return strdup("UTF-8");
+}
diff --git a/loader/shpcommon.h b/loader/shpcommon.h

index 89a5beb033143c681c023efcd9ee5882dd4fb054..817c7b2110e8385351792c78d7d05333c196403c 100644 (file)
--- a/loader/shpcommon.h
+++ b/loader/shpcommon.h
@@ -24,6 +24,87 @@
  #define _(String) String
  #endif
  
+
+
+typedef struct 
+{
+    int ldid;
+    int cpg;
+    char *desc;
+    char *iconv;
+    char *pg;
+} code_page_entry;
+
+static int num_code_pages = 60;
+
+/* http://www.autopark.ru/ASBProgrammerGuide/DBFSTRUC.HTM */
+/* http://resources.arcgis.com/fr/content/kbase?fa=articleShow&d=21106 */
+
+static code_page_entry code_pages[] = {
+    {0x01, 437, "U.S. MS-DOS", "CP437",""},
+    {0x02, 850, "International MS-DOS", "CP850",""},
+    {0x03, 1252, "Window ANSI", "WINDOWS-1252","WIN1252"},
+    {0x08, 865, "Danish OEM", "CP865",""},
+    {0x09, 437, "Dutch OEM", "CP437",""},
+    {0x0A, 850, "Dutch OEM*", "CP850",""},
+    {0x0B, 437, "Finnish OEM", "CP437",""},
+    {0x0D, 437, "French OEM", "CP437",""},
+    {0x0E, 850, "French OEM*", "CP850",""},
+    {0x0F, 437, "German OEM", "CP437",""},
+    {0x10, 850, "German OEM*", "CP850",""},
+    {0x11, 437, "Italian OEM", "CP437",""},
+    {0x12, 850, "Italian OEM*", "CP850",""},
+    {0x13, 932, "Japanese Shift-JIS", "CP932","SJIS"},
+    {0x14, 850, "Spanish OEM*", "CP850",""},
+    {0x15, 437, "Swedish OEM", "CP437",""},
+    {0x16, 850, "Swedish OEM*", "CP850",""},
+    {0x17, 865, "Norwegian OEM", "CP865",""},
+    {0x18, 437, "Spanish OEM", "CP865",""},
+    {0x19, 437, "English OEM (Britain)", "CP437",""},
+    {0x1A, 850, "English OEM (Britain)*", "CP850",""},
+    {0x1B, 437, "English OEM (U.S.)", "CP437",""},
+    {0x1C, 863, "French OEM (Canada)", "CP863",""},
+    {0x1D, 850, "French OEM*", "CP850",""},
+    {0x1F, 852, "Czech OEM", "CP852",""},
+    {0x22, 852, "Hungarian OEM", "CP852",""},
+    {0x23, 852, "Polish OEM", "CP852",""},
+    {0x24, 860, "Portugese OEM", "CP860",""},
+    {0x25, 850, "Potugese OEM*", "CP850",""},
+    {0x26, 866, "Russian OEM", "WINDOWS-866","WIN866"},
+    {0x37, 850, "English OEM (U.S.)*", "CP850",""},
+    {0x40, 852, "Romanian OEM", "CP852",""},
+    {0x4D, 936, "Chinese GBK (PRC)", "CP936",""},
+    {0x4E, 949, "Korean (ANSI/OEM)", "CP949",""},
+    {0x4F, 950, "Chinese Big 5 (Taiwan)", "CP950","BIG5"},
+    {0x50, 874, "Thai (ANSI/OEM)", "WIN874",""},
+    {0x57, 1252, "ANSI", "WINDOWS-1252",""},
+    {0x58, 1252, "Western European ANSI", "WINDOWS-1252",""},
+    {0x59, 1252, "Spanish ANSI", "WINDOWS-1252",""},
+    {0x64, 852, "Eastern European MS-DOS", "CP852",""},
+    {0x65, 866, "Russian MS-DOS", "CP866",""},
+    {0x66, 865, "Nordic MS-DOS", "CP865",""},
+    {0x67, 861, "Icelandic MS-DOS", "",""},
+    {0x6A, 737, "Greek MS-DOS (437G)", "CP737",""},
+    {0x6B, 857, "Turkish MS-DOS", "CP857",""},
+    {0x6C, 863, "French-Canadian MS-DOS", "CP863",""},
+    {0x78, 950, "Taiwan Big 5", "CP950",""},
+    {0x79, 949, "Hangul (Wansung)", "CP949",""},
+    {0x7A, 936, "PRC GBK", "CP936","GBK"},
+    {0x7B, 932, "Japanese Shift-JIS", "CP932",""},
+    {0x7C, 874, "Thai Windows/MS-DOS", "WINDOWS-874","WIN874"},
+    {0x86, 737, "Greek OEM", "CP737",""},
+    {0x87, 852, "Slovenian OEM", "CP852",""},
+    {0x88, 857, "Turkish OEM", "CP857",""},
+    {0xC8, 1250, "Eastern European Windows", "WINDOWS-1250","WIN1250"},
+    {0xC9, 1251, "Russian Windows", "WINDOWS-1251","WIN1251"},
+    {0xCA, 1254, "Turkish Windows", "WINDOWS-1254","WIN1254"},
+    {0xCB, 1253, "Greek Windows", "WINDOWS-1253","WIN1253"},
+    {0xCC, 1257, "Baltic Window", "WINDOWS-1257","WIN1257"},
+    {0xFF, 65001, "UTF-8", "UTF-8","UTF8"}
+};
+
+
+
  typedef struct shp_connection_state
  {
         /* PgSQL username to log in with */
@@ -100,4 +181,7 @@ const char *colmap_dbf_by_pg(colmap *map, const char *pgname);
  
  const char *colmap_pg_by_dbf(colmap *map, const char *dbfname);
  
+char *codepage2encoding(const char *cpg);
+char *encoding2codepage(const char *encoding);
+
  #endif
diff --git a/regress/dumper/literalsrid_expected.dbf b/regress/dumper/literalsrid_expected.dbf

index df11d721a0095ed428f98262d47d1209f098a65c..cafd86349ca8519a404e4c12f94b486870695a65 100644 (file)

Binary files a/regress/dumper/literalsrid_expected.dbf and b/regress/dumper/literalsrid_expected.dbf differ
diff --git a/regress/dumper/realtable_expected.dbf b/regress/dumper/realtable_expected.dbf

index 0d2ac503742b5b72b89495d5f834756a32d946db..7f420e416b9b5a693b7204da214527482f99152f 100644 (file)

Binary files a/regress/dumper/realtable_expected.dbf and b/regress/dumper/realtable_expected.dbf differ
author	Paul Ramsey <pramsey@cleverelephant.ca>
	Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)
committer	Paul Ramsey <pramsey@cleverelephant.ca>
	Wed, 23 Sep 2015 21:04:09 +0000 (21:04 +0000)
loader/pgsql2shp-core.c		patch \| blob \| history
loader/shp2pgsql-core.c		patch \| blob \| history
loader/shpcommon.c		patch \| blob \| history
loader/shpcommon.h		patch \| blob \| history
regress/dumper/literalsrid_expected.dbf		patch \| blob \| history
regress/dumper/realtable_expected.dbf		patch \| blob \| history