PR/456, use utf-7 charset in mime printing for utf-7 files.

author Christos Zoulas <christos@zoulas.com>

Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)

committer Christos Zoulas <christos@zoulas.com>

Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)
author Christos Zoulas <christos@zoulas.com>
Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)
committer Christos Zoulas <christos@zoulas.com>
Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)
diff --git a/ChangeLog b/ChangeLog

index 5cd5c204d783059e6df3c8b16d5c05e343d0ad20..91f10a81ed39c2fc278bc2897915b3639cd5f8b6 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,8 @@
  
+2015-06-03  16:00  Christos Zoulas <christos@zoulas.com>
+
+       * PR/455: Add utf-7 encoding
+
  2015-06-03  14:30  Christos Zoulas <christos@zoulas.com>
  
         * PR/455: Implement -Z, look inside, but don't report on compression
diff --git a/doc/file.man b/doc/file.man

index 69b9a80d86b74f06b00ccc20681f99a071452568..01e1fc2ef5e3b1482929fbc2d29df023e9fffccd 100644 (file)
--- a/doc/file.man
+++ b/doc/file.man
@@ -1,4 +1,4 @@
-.\" $File: file.man,v 1.115 2015/05/29 14:27:31 christos Exp $
+.\" $File: file.man,v 1.116 2015/06/03 18:21:24 christos Exp $
  .Dd June 3, 2015
  .Dt FILE __CSECTION__
  .Os
@@ -597,6 +597,9 @@ The handling of
  and printing \e012- between entries is clumsy and complicated; refactor
  and centralize.
  .Pp
+Some of the encoding logic is hard-coded in encoding.c and could be moved
+to the magic files if we had a !:charset annotation.
+.Pp
  Continue to squash all magic bugs.
  See Debian BTS for a good source.
  .Pp
diff --git a/src/encoding.c b/src/encoding.c

index a32d6b381e5e76e5f52fcce1f135dc2ca1c9ba4b..7dbbe7b180bdef64e7251d7e44d6c75c6a6b2679 100644 (file)
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
  #include "file.h"
  
  #ifndef        lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.11 2015/01/24 23:22:25 christos Exp $")
  #endif /* lint */
  
  #include "magic.h"
@@ -47,6 +47,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
  private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
      size_t *);
+private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
  private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
@@ -88,9 +89,15 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
         }
  
         if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
-               DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
-               *code = "ASCII";
-               *code_mime = "us-ascii";
+               if (looks_utf7(buf, nbytes, *ubuf, ulen)) {
+                       DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
+                       *code = "UTF-7 Unicode";
+                       *code_mime = "utf-7";
+               } else {
+                       DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
+                       *code = "ASCII";
+                       *code_mime = "us-ascii";
+               }
         } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
                 DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
                 *code = "UTF-8 Unicode (with BOM)";
@@ -371,6 +378,25 @@ looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
                 return -1;
  }
  
+/*
+ * Check whether buf starts with a UTF-7 signature: the bytes "+/v"
+ * followed by one of '8', '9', '+' or '/'.  More than 4 bytes must be
+ * available for a match.
+ *
+ * Returns 1 on a match; in that case *ulen is set to 0 when ubuf is
+ * non-NULL (nothing is stored in ubuf).  Returns -1 otherwise and leaves
+ * *ulen untouched, so a length computed by an earlier detector through
+ * the same pointer is preserved.  The no-match value is non-zero, so
+ * callers must test the result with "> 0" rather than for truth.
+ */
+private int
+looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+{
+       if (nbytes <= 4 || buf[0] != '+' || buf[1] != '/' || buf[2] != 'v')
+               return -1;
+
+       switch (buf[3]) {
+       case '8':
+       case '9':
+       case '+':
+       case '/':
+               /* Only report a length once the signature is confirmed. */
+               if (ubuf)
+                       *ulen = 0;
+               return 1;
+       default:
+               return -1;
+       }
+}
+
  private int
  looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
      size_t *ulen)
author	Christos Zoulas <christos@zoulas.com>
	Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)
committer	Christos Zoulas <christos@zoulas.com>
	Wed, 3 Jun 2015 19:51:27 +0000 (19:51 +0000)
ChangeLog		patch \| blob \| history
doc/file.man		patch \| blob \| history
src/encoding.c		patch \| blob \| history