granicus.if.org Git - python/commitdiff
Fix a delimiter detection problem in sniffer. Sniffing "a|b|c\r\n" was
author Skip Montanaro <skip@pobox.com>
Fri, 30 Dec 2005 05:09:48 +0000 (05:09 +0000)
committer Skip Montanaro <skip@pobox.com>
Fri, 30 Dec 2005 05:09:48 +0000 (05:09 +0000)
returning 'a' as the delimiter.  It now returns '|', but not because I
understood better what the code was supposed to do.  Would someone that
understands the idea behind _guess_delimiter() (see its doc string) look to
see if my fallback choice is better than before or if it's just serendipity
that I picked the proper delimiter?

Lib/csv.py
Lib/test/test_csv.py

index 75163800834cba8917f35f673f91ea1a191b2783..f213854783eb67f1483e3abeeb94c70afefa8193 100644 (file)
@@ -152,10 +152,13 @@ class Sniffer:
 
         quotechar, delimiter, skipinitialspace = \
                    self._guess_quote_and_delimiter(sample, delimiters)
-        if delimiter is None:
+        if not delimiter:
             delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                 delimiters)
 
+        if not delimiter:
+            raise Error, "Could not determine delimiter"
+
         class dialect(Dialect):
             _name = "sniffed"
             lineterminator = '\r\n'
@@ -329,8 +332,12 @@ class Sniffer:
                                         data[0].count("%c " % d))
                     return (d, skipinitialspace)
 
-        # finally, just return the first damn character in the list
-        delim = delims.keys()[0]
+        # nothing else indicates a preference, pick the character that
+        # dominates(?)
+        items = [(v,k) for (k,v) in delims.items()]
+        items.sort()
+        delim = items[-1][1]
+
         skipinitialspace = (data[0].count(delim) ==
                             data[0].count("%c " % delim))
         return (delim, skipinitialspace)
index 0ad77ef09c6ef0217632da7f644d31cde072473f..8511a5ae97dfb28bcd9aeaf7dabf904c009a0739 100644 (file)
@@ -852,6 +852,8 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
 '''
 
     sample5 = "aaa\tbbb\r\nAAA\t\r\nBBB\t\r\n"
+    sample6 = "a|b|c\r\nd|e|f\r\n"
+    sample7 = "'a'|'b'|'c'\r\n'd'|e|f\r\n"
 
     def test_has_header(self):
         sniffer = csv.Sniffer()
@@ -882,6 +884,11 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
         self.assertEqual(dialect.delimiter, ";")
         dialect = sniffer.sniff(self.sample5)
         self.assertEqual(dialect.delimiter, "\t")
+        dialect = sniffer.sniff(self.sample6)
+        self.assertEqual(dialect.delimiter, "|")
+        dialect = sniffer.sniff(self.sample7)
+        self.assertEqual(dialect.delimiter, "|")
+        self.assertEqual(dialect.quotechar, "'")
 
 if not hasattr(sys, "gettotalrefcount"):
     if test_support.verbose: print "*** skipping leakage tests ***"