From 0f7059edfda841f40cd7bdbe1e24d53fa8518fa0 Mon Sep 17 00:00:00 2001
From: Christos Zoulas <christos@zoulas.com>
Date: Wed, 3 Jun 2015 19:51:27 +0000
Subject: [PATCH] PR/456, use utf-7 charset in mime printing for utf-7 files.

---
 ChangeLog      |  4 ++++
 doc/file.man   |  5 ++++-
 src/encoding.c | 34 ++++++++++++++++++++++++++++++----
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 5cd5c204..91f10a81 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,8 @@
 
+2015-06-03  16:00  Christos Zoulas <christos@zoulas.com>
+
+	* PR/456: Add utf-7 encoding
+
 2015-06-03  14:30  Christos Zoulas <christos@zoulas.com>
 
 	* PR/455: Implement -Z, look inside, but don't report on compression
diff --git a/doc/file.man b/doc/file.man
index 69b9a80d..01e1fc2e 100644
--- a/doc/file.man
+++ b/doc/file.man
@@ -1,4 +1,4 @@
-.\" $File: file.man,v 1.115 2015/05/29 14:27:31 christos Exp $
+.\" $File: file.man,v 1.116 2015/06/03 18:21:24 christos Exp $
 .Dd June 3, 2015
 .Dt FILE __CSECTION__
 .Os
@@ -597,6 +597,9 @@ The handling of
 and printing \e012- between entries is clumsy and complicated; refactor
 and centralize.
 .Pp
+Some of the encoding logic is hard-coded in encoding.c and could be moved
+to the magic files if we had a !:charset annotation.
+.Pp
 Continue to squash all magic bugs.
 See Debian BTS for a good source.
 .Pp
diff --git a/src/encoding.c b/src/encoding.c
index a32d6b38..7dbbe7b1 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
 #include "file.h"
 
 #ifndef	lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.11 2015/01/24 23:22:25 christos Exp $")
 #endif	/* lint */
 
 #include "magic.h"
@@ -47,6 +47,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
     size_t *);
+private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
@@ -88,9 +89,15 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
 	}
 
 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
-		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
-		*code = "ASCII";
-		*code_mime = "us-ascii";
+		if (looks_utf7(buf, nbytes, *ubuf, ulen)) {
+			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
+			*code = "UTF-7 Unicode";
+			*code_mime = "utf-7";
+		} else {
+			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
+			*code = "ASCII";
+			*code_mime = "us-ascii";
+		}
 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode (with BOM)";
@@ -371,6 +378,25 @@ looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 		return -1;
 }
 
+/*
+ * Check for the UTF-7 byte order mark: "+/v" followed by '8', '9', '+'
+ * or '/'.  Needs more than 4 bytes of input.  Returns 1 on a match and,
+ * when ubuf is non-NULL, sets *ulen to 0.  Returns 0 otherwise so that
+ * the caller's plain truth test works (-1 would read as true), and
+ * leaves *ulen exactly as the caller's earlier checks left it.
+ */
+private int
+looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+{
+	if (nbytes <= 4 || buf[0] != '+' || buf[1] != '/' || buf[2] != 'v')
+		return 0;
+	if (buf[3] != '8' && buf[3] != '9' && buf[3] != '+' && buf[3] != '/')
+		return 0;
+	if (ubuf)
+		*ulen = 0;
+	return 1;
+}
+
 private int
 looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
     size_t *ulen)
-- 
2.50.1