From 0f7059edfda841f40cd7bdbe1e24d53fa8518fa0 Mon Sep 17 00:00:00 2001
From: Christos Zoulas
Date: Wed, 3 Jun 2015 19:51:27 +0000
Subject: [PATCH] PR/456, use utf-7 charset in mime printing for utf-7 files.

Detect the UTF-7 signature ("+/v" followed by '8', '9', '+' or '/') at
the start of ASCII input and report it as "UTF-7 Unicode" / "utf-7".

looks_utf7() returns 1 on a match and -1 otherwise, so file_encoding()
must test the result with "> 0"; a bare truth test would treat -1 as a
match and label every ASCII file as UTF-7.  looks_utf7() also leaves
*ulen alone unless it matches, so the count stored by looks_ascii() is
preserved for plain ASCII input.
---
 ChangeLog      |  4 ++++
 doc/file.man   |  5 ++++-
 src/encoding.c | 40 ++++++++++++++++++++++++++++++++++++----
 3 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 5cd5c204..91f10a81 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,8 @@
+2015-06-03  16:00  Christos Zoulas
+
+	* PR/456: Add utf-7 encoding
+
 2015-06-03  14:30  Christos Zoulas
 
 	* PR/455: Implement -Z, look inside, but don't report on compression
 
diff --git a/doc/file.man b/doc/file.man
index 69b9a80d..01e1fc2e 100644
--- a/doc/file.man
+++ b/doc/file.man
@@ -1,4 +1,4 @@
-.\" $File: file.man,v 1.115 2015/05/29 14:27:31 christos Exp $
+.\" $File: file.man,v 1.116 2015/06/03 18:21:24 christos Exp $
 .Dd June 3, 2015
 .Dt FILE __CSECTION__
 .Os
@@ -597,6 +597,9 @@ The handling of
 and printing \e012- between entries is clumsy and complicated; refactor
 and centralize.
 .Pp
+Some of the encoding logic is hard-coded in encoding.c and can be moved
+to the magic files if we had a !:charset annotation
+.Pp
 Continue to squash all magic bugs.
 See Debian BTS for a good source.
 .Pp
diff --git a/src/encoding.c b/src/encoding.c
index a32d6b38..7dbbe7b1 100644
--- a/src/encoding.c
+++ b/src/encoding.c
@@ -35,7 +35,7 @@
 #include "file.h"
 
 #ifndef	lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.11 2015/01/24 23:22:25 christos Exp $")
 #endif	/* lint */
 
 #include "magic.h"
@@ -47,6 +47,7 @@ FILE_RCSID("@(#)$File: encoding.c,v 1.10 2014/09/11 12:08:52 christos Exp $")
 private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
     size_t *);
+private int looks_utf7(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
@@ -88,9 +89,15 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
 	}
 
 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
-		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
-		*code = "ASCII";
-		*code_mime = "us-ascii";
+		if (looks_utf7(buf, nbytes, *ubuf, ulen) > 0) {
+			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));
+			*code = "UTF-7 Unicode";
+			*code_mime = "utf-7";
+		} else {
+			DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
+			*code = "ASCII";
+			*code_mime = "us-ascii";
+		}
 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
 		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode (with BOM)";
@@ -371,6 +378,31 @@ looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
 		return -1;
 }
 
+/*
+ * Check for a UTF-7 signature at the start of buf: "+/v" followed by
+ * one of '8', '9', '+' or '/'.  Returns 1 on a match and -1 otherwise.
+ * *ulen is reset to 0 only on a match (and only if ubuf is non-NULL),
+ * so a failed check does not disturb a count stored by the caller.
+ */
+private int
+looks_utf7(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+{
+	if (nbytes > 4 && buf[0] == '+' && buf[1] == '/' && buf[2] == 'v')
+		switch (buf[3]) {
+		case '8':
+		case '9':
+		case '+':
+		case '/':
+			if (ubuf)
+				*ulen = 0;
+			return 1;
+		default:
+			return -1;
+		}
+	else
+		return -1;
+}
+
 private int
 looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
     size_t *ulen)
-- 
2.40.0