From: Paul Ramsey Date: Wed, 23 Sep 2015 21:04:09 +0000 (+0000) Subject: #2220, write the encoding down when exporting a shape file X-Git-Tag: 2.2.0rc1~12 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=32f2ea28aae3ccb4d03805c30a23ef096fc41201;p=postgis #2220, write the encoding down when exporting a shape file git-svn-id: http://svn.osgeo.org/postgis/trunk@14090 b70326c6-7e19-0410-871a-916f4a2858ee --- diff --git a/loader/pgsql2shp-core.c b/loader/pgsql2shp-core.c index dfb6879f9..9e3f11560 100644 --- a/loader/pgsql2shp-core.c +++ b/loader/pgsql2shp-core.c @@ -1217,6 +1217,11 @@ ShpDumperGetConnectionStringFromConn(SHPCONNECTIONCONFIG *conn) strcat(connstring, conn->database); } + if ( ! getenv("PGCLIENTENCODING") ) + { + strcat(connstring, " client_encoding=UTF8"); + } + return connstring; } @@ -1417,8 +1422,19 @@ ShpDumperOpenTable(SHPDUMPERSTATE *state) else state->shp_file = state->table; - /* Create the dbf file */ - state->dbf = DBFCreate(state->shp_file); + /* Create the dbf file: */ + /* If there's a user-specified encoding hanging around, try and use that. */ + /* Otherwise, just use UTF-8 encoding, since that's usually our client encoding. */ + if ( getenv("PGCLIENTENCODING") ) + { + char *codepage = encoding2codepage(getenv("PGCLIENTENCODING")); + state->dbf = DBFCreateEx(state->shp_file, codepage); + } + else + { + state->dbf = DBFCreateEx(state->shp_file, "UTF-8"); + } + if (!state->dbf) { snprintf(state->message, SHPDUMPERMSGLEN, _("Could not create dbf file %s"), state->shp_file); diff --git a/loader/shp2pgsql-core.c b/loader/shp2pgsql-core.c index 5fb3d06de..460baffb7 100644 --- a/loader/shp2pgsql-core.c +++ b/loader/shp2pgsql-core.c @@ -20,80 +20,6 @@ #include "../liblwgeom/lwgeom_log.h" /* for LWDEBUG macros */ -typedef struct -{ - int ldid; - int cpg; - char *desc; - char *iconv; - char *pg; -} code_page_entry; - -static int num_code_pages = 60; - -static code_page_entry code_pages[] = { - {0x01, 437, "U.S. 
MS-DOS", "CP437",""}, - {0x02, 850, "International MS-DOS", "CP850",""}, - {0x03, 1252, "Window ANSI", "WINDOWS-1252","WIN1252"}, - {0x08, 865, "Danish OEM", "CP865",""}, - {0x09, 437, "Dutch OEM", "CP437",""}, - {0x0A, 850, "Dutch OEM*", "CP850",""}, - {0x0B, 437, "Finnish OEM", "CP437",""}, - {0x0D, 437, "French OEM", "CP437",""}, - {0x0E, 850, "French OEM*", "CP850",""}, - {0x0F, 437, "German OEM", "CP437",""}, - {0x10, 850, "German OEM*", "CP850",""}, - {0x11, 437, "Italian OEM", "CP437",""}, - {0x12, 850, "Italian OEM*", "CP850",""}, - {0x13, 932, "Japanese Shift-JIS", "CP932","SJIS"}, - {0x14, 850, "Spanish OEM*", "CP850",""}, - {0x15, 437, "Swedish OEM", "CP437",""}, - {0x16, 850, "Swedish OEM*", "CP850",""}, - {0x17, 865, "Norwegian OEM", "CP865",""}, - {0x18, 437, "Spanish OEM", "CP865",""}, - {0x19, 437, "English OEM (Britain)", "CP437",""}, - {0x1A, 850, "English OEM (Britain)*", "CP850",""}, - {0x1B, 437, "English OEM (U.S.)", "CP437",""}, - {0x1C, 863, "French OEM (Canada)", "CP863",""}, - {0x1D, 850, "French OEM*", "CP850",""}, - {0x1F, 852, "Czech OEM", "CP852",""}, - {0x22, 852, "Hungarian OEM", "CP852",""}, - {0x23, 852, "Polish OEM", "CP852",""}, - {0x24, 860, "Portugese OEM", "CP860",""}, - {0x25, 850, "Potugese OEM*", "CP850",""}, - {0x26, 866, "Russian OEM", "WINDOWS-866","WIN866"}, - {0x37, 850, "English OEM (U.S.)*", "CP850",""}, - {0x40, 852, "Romanian OEM", "CP852",""}, - {0x4D, 936, "Chinese GBK (PRC)", "CP936",""}, - {0x4E, 949, "Korean (ANSI/OEM)", "CP949",""}, - {0x4F, 950, "Chinese Big 5 (Taiwan)", "CP950","BIG5"}, - {0x50, 874, "Thai (ANSI/OEM)", "WIN874",""}, - {0x57, 1252, "ANSI", "WINDOWS-1252",""}, - {0x58, 1252, "Western European ANSI", "WINDOWS-1252",""}, - {0x59, 1252, "Spanish ANSI", "WINDOWS-1252",""}, - {0x64, 852, "Eastern European MS-DOS", "CP852",""}, - {0x65, 866, "Russian MS-DOS", "CP866",""}, - {0x66, 865, "Nordic MS-DOS", "CP865",""}, - {0x67, 861, "Icelandic MS-DOS", "",""}, - {0x6A, 737, "Greek MS-DOS (437G)", 
"CP737",""}, - {0x6B, 857, "Turkish MS-DOS", "CP857",""}, - {0x6C, 863, "French-Canadian MS-DOS", "CP863",""}, - {0x78, 950, "Taiwan Big 5", "CP950",""}, - {0x79, 949, "Hangul (Wansung)", "CP949",""}, - {0x7A, 936, "PRC GBK", "CP936","GBK"}, - {0x7B, 932, "Japanese Shift-JIS", "CP932",""}, - {0x7C, 874, "Thai Windows/MS-DOS", "WINDOWS-874","WIN874"}, - {0x86, 737, "Greek OEM", "CP737",""}, - {0x87, 852, "Slovenian OEM", "CP852",""}, - {0x88, 857, "Turkish OEM", "CP857",""}, - {0xC8, 1250, "Eastern European Windows", "WINDOWS-1250","WIN1250"}, - {0xC9, 1251, "Russian Windows", "WINDOWS-1251","WIN1251"}, - {0xCA, 1254, "Turkish Windows", "WINDOWS-1254","WIN1254"}, - {0xCB, 1253, "Greek Windows", "WINDOWS-1253","WIN1253"}, - {0xCC, 1257, "Baltic Window", "WINDOWS-1257","WIN1257"}, - {0xFF, 65001, "UTF-8", "UTF-8","UTF8"} -}; - /* Internal ring/point structures */ typedef struct struct_point @@ -128,58 +54,6 @@ int FindPolygons(SHPObject *obj, Ring ***Out); void ReleasePolygons(Ring **polys, int npolys); int GeneratePolygonGeometry(SHPLOADERSTATE *state, SHPObject *obj, char **geometry); -/* -* Code page info will come out of dbfopen as either a bare codepage number -* (e.g. 1256) or as "LDID/1234" from the DBF hreader. -*/ -static char * -codepage2encoding(const char *cpg) -{ - int cpglen; - int is_ldid = 0; - int num, i; - - /* Do nothing on nothing. */ - if ( ! cpg ) return NULL; - - /* Is this an LDID string? */ - /* If so, note it and move past the "LDID/" tag */ - cpglen = strlen(cpg); - if ( strstr(cpg, "LDID/") ) - { - if ( cpglen > 5 ) - { - cpg += 5; - is_ldid = 1; - } - else - { - return NULL; - } - } - - /* Read the number */ - num = atoi(cpg); - - /* Can we find this number in our lookup table? 
*/ - for ( i = is_ldid ; i < num_code_pages; i++ ) - { - if ( is_ldid ) - { - if ( code_pages[i].ldid == num ) - return strdup(code_pages[i].iconv); - } - else - { - if ( code_pages[i].cpg == num ) - return strdup(code_pages[i].iconv); - } - } - - /* Didn't find a matching entry */ - return NULL; - -} /* Return allocated string containing UTF8 string converted from encoding fromcode */ static int diff --git a/loader/shpcommon.c b/loader/shpcommon.c index 8d252229d..e3f0b4126 100644 --- a/loader/shpcommon.c +++ b/loader/shpcommon.c @@ -196,3 +196,86 @@ colmap_read(const char *filename, colmap *map, char *errbuf, size_t errbuflen) return 1; } +/* +* Code page info will come out of dbfopen as either a bare codepage number +* (e.g. 1256) or as "LDID/1234" from the DBF header. We want to look up +* the equivalent iconv encoding string so we can use iconv to transcode +* the data into UTF8 +*/ +char * +codepage2encoding(const char *cpg) +{ + int cpglen; + int is_ldid = 0; + int num, i; + + /* Do nothing on nothing. */ + if ( ! cpg ) return NULL; + + /* Is this an LDID string? */ + /* If so, note it and move past the "LDID/" tag */ + cpglen = strlen(cpg); + if ( strstr(cpg, "LDID/") ) + { + if ( cpglen > 5 ) + { + cpg += 5; + is_ldid = 1; + } + else + { + return NULL; + } + } + + /* Read the number */ + num = atoi(cpg); + + /* Can we find this number in our lookup table? 
*/ + for ( i = is_ldid ; i < num_code_pages; i++ ) + { + if ( is_ldid ) + { + if ( code_pages[i].ldid == num ) + return strdup(code_pages[i].iconv); + } + else + { + if ( code_pages[i].cpg == num ) + return strdup(code_pages[i].iconv); + } + } + + /* Didn't find a matching entry */ + return NULL; + +} + +/* +* In the case where data is coming out of the database in some weird encoding +* we want to look up the appropriate code page entry to feed to DBFCreateEx +*/ +char * +encoding2codepage(const char *encoding) +{ + int i; + for ( i = 0; i < num_code_pages; i++ ) + { + if ( strcasecmp(encoding, code_pages[i].pg) == 0 ) + { + if ( code_pages[i].ldid == 0xFF ) + { + return strdup("UTF-8"); + } + else + { + char *codepage = NULL; + asprintf(&codepage, "LDID/%d", code_pages[i].ldid); + return codepage; + } + } + } + + /* OK, we give up, pretend it's UTF8 */ + return strdup("UTF-8"); +} diff --git a/loader/shpcommon.h b/loader/shpcommon.h index 89a5beb03..817c7b211 100644 --- a/loader/shpcommon.h +++ b/loader/shpcommon.h @@ -24,6 +24,87 @@ #define _(String) String #endif + + +typedef struct +{ + int ldid; + int cpg; + char *desc; + char *iconv; + char *pg; +} code_page_entry; + +static int num_code_pages = 60; + +/* http://www.autopark.ru/ASBProgrammerGuide/DBFSTRUC.HTM */ +/* http://resources.arcgis.com/fr/content/kbase?fa=articleShow&d=21106 */ + +static code_page_entry code_pages[] = { + {0x01, 437, "U.S. 
MS-DOS", "CP437",""}, + {0x02, 850, "International MS-DOS", "CP850",""}, + {0x03, 1252, "Window ANSI", "WINDOWS-1252","WIN1252"}, + {0x08, 865, "Danish OEM", "CP865",""}, + {0x09, 437, "Dutch OEM", "CP437",""}, + {0x0A, 850, "Dutch OEM*", "CP850",""}, + {0x0B, 437, "Finnish OEM", "CP437",""}, + {0x0D, 437, "French OEM", "CP437",""}, + {0x0E, 850, "French OEM*", "CP850",""}, + {0x0F, 437, "German OEM", "CP437",""}, + {0x10, 850, "German OEM*", "CP850",""}, + {0x11, 437, "Italian OEM", "CP437",""}, + {0x12, 850, "Italian OEM*", "CP850",""}, + {0x13, 932, "Japanese Shift-JIS", "CP932","SJIS"}, + {0x14, 850, "Spanish OEM*", "CP850",""}, + {0x15, 437, "Swedish OEM", "CP437",""}, + {0x16, 850, "Swedish OEM*", "CP850",""}, + {0x17, 865, "Norwegian OEM", "CP865",""}, + {0x18, 437, "Spanish OEM", "CP865",""}, + {0x19, 437, "English OEM (Britain)", "CP437",""}, + {0x1A, 850, "English OEM (Britain)*", "CP850",""}, + {0x1B, 437, "English OEM (U.S.)", "CP437",""}, + {0x1C, 863, "French OEM (Canada)", "CP863",""}, + {0x1D, 850, "French OEM*", "CP850",""}, + {0x1F, 852, "Czech OEM", "CP852",""}, + {0x22, 852, "Hungarian OEM", "CP852",""}, + {0x23, 852, "Polish OEM", "CP852",""}, + {0x24, 860, "Portugese OEM", "CP860",""}, + {0x25, 850, "Potugese OEM*", "CP850",""}, + {0x26, 866, "Russian OEM", "WINDOWS-866","WIN866"}, + {0x37, 850, "English OEM (U.S.)*", "CP850",""}, + {0x40, 852, "Romanian OEM", "CP852",""}, + {0x4D, 936, "Chinese GBK (PRC)", "CP936",""}, + {0x4E, 949, "Korean (ANSI/OEM)", "CP949",""}, + {0x4F, 950, "Chinese Big 5 (Taiwan)", "CP950","BIG5"}, + {0x50, 874, "Thai (ANSI/OEM)", "WIN874",""}, + {0x57, 1252, "ANSI", "WINDOWS-1252",""}, + {0x58, 1252, "Western European ANSI", "WINDOWS-1252",""}, + {0x59, 1252, "Spanish ANSI", "WINDOWS-1252",""}, + {0x64, 852, "Eastern European MS-DOS", "CP852",""}, + {0x65, 866, "Russian MS-DOS", "CP866",""}, + {0x66, 865, "Nordic MS-DOS", "CP865",""}, + {0x67, 861, "Icelandic MS-DOS", "",""}, + {0x6A, 737, "Greek MS-DOS (437G)", 
"CP737",""}, + {0x6B, 857, "Turkish MS-DOS", "CP857",""}, + {0x6C, 863, "French-Canadian MS-DOS", "CP863",""}, + {0x78, 950, "Taiwan Big 5", "CP950",""}, + {0x79, 949, "Hangul (Wansung)", "CP949",""}, + {0x7A, 936, "PRC GBK", "CP936","GBK"}, + {0x7B, 932, "Japanese Shift-JIS", "CP932",""}, + {0x7C, 874, "Thai Windows/MS-DOS", "WINDOWS-874","WIN874"}, + {0x86, 737, "Greek OEM", "CP737",""}, + {0x87, 852, "Slovenian OEM", "CP852",""}, + {0x88, 857, "Turkish OEM", "CP857",""}, + {0xC8, 1250, "Eastern European Windows", "WINDOWS-1250","WIN1250"}, + {0xC9, 1251, "Russian Windows", "WINDOWS-1251","WIN1251"}, + {0xCA, 1254, "Turkish Windows", "WINDOWS-1254","WIN1254"}, + {0xCB, 1253, "Greek Windows", "WINDOWS-1253","WIN1253"}, + {0xCC, 1257, "Baltic Window", "WINDOWS-1257","WIN1257"}, + {0xFF, 65001, "UTF-8", "UTF-8","UTF8"} +}; + + + typedef struct shp_connection_state { /* PgSQL username to log in with */ @@ -100,4 +181,7 @@ const char *colmap_dbf_by_pg(colmap *map, const char *pgname); const char *colmap_pg_by_dbf(colmap *map, const char *dbfname); +char *codepage2encoding(const char *cpg); +char *encoding2codepage(const char *encoding); + #endif diff --git a/regress/dumper/literalsrid_expected.dbf b/regress/dumper/literalsrid_expected.dbf index df11d721a..cafd86349 100644 Binary files a/regress/dumper/literalsrid_expected.dbf and b/regress/dumper/literalsrid_expected.dbf differ diff --git a/regress/dumper/realtable_expected.dbf b/regress/dumper/realtable_expected.dbf index 0d2ac5037..7f420e416 100644 Binary files a/regress/dumper/realtable_expected.dbf and b/regress/dumper/realtable_expected.dbf differ