From eb9ec0dc08a3470252dd85ed798ff74b49784ab9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Fran=C3=A7ois=20Pinard?= Date: Thu, 13 Mar 2008 13:15:34 -0400 Subject: [PATCH] Java charset --- doc/recode.info | 214 +++++++++++++++++++++++++--------------------- doc/recode.texi | 26 +++++- doc/stamp-vti | 2 +- doc/version.texi | 2 +- src/ChangeLog | 5 ++ src/Makefile.am | 8 +- src/Makefile.in | 46 +++++----- src/decsteps.h | 2 + src/inisteps.h | 1 + src/java.c | 116 +++++++++++++++++++++++++ src/tersteps.h | 1 + tests/ChangeLog | 4 + tests/t40_java.py | 31 +++++++ 13 files changed, 332 insertions(+), 126 deletions(-) create mode 100644 src/java.c create mode 100644 tests/t40_java.py diff --git a/doc/recode.info b/doc/recode.info index cc49414..f32cf2e 100644 --- a/doc/recode.info +++ b/doc/recode.info @@ -134,6 +134,7 @@ Various other charsets * Vietnamese:: Vietnamese charsets * African:: African charsets * Others:: Cyrillic and other charsets +* Java:: Java code * Texte:: Easy French conventions * Mule:: Mule as a multiplexed charset @@ -3896,9 +3897,10 @@ to handle these particularly well for French texts. * HTML:: World Wide Web representations * LaTeX:: LaTeX macro calls * Texinfo:: GNU project documentation files -* Vietnamese:: +* Vietnamese:: Vietnamese charsets * African:: African charsets -* Others:: +* Others:: Cyrillic and other charsets +* Java:: Java code * Texte:: Easy French conventions * Mule:: Mule as a multiplexed charset @@ -4124,7 +4126,7 @@ Recode under the name `AFRL1-101-BPI_OCIL'. Accepted aliases are `t-fra' and `t-francais'.  -File: recode.info, Node: Others, Next: Texte, Prev: African, Up: Miscellaneous +File: recode.info, Node: Others, Next: Java, Prev: African, Up: Miscellaneous 12.6 Cyrillic and other charsets ================================ @@ -4159,9 +4161,27 @@ everybody, and this section will merely disappear. This charset is available under the name `KOI-8_CS2'.  
-File: recode.info, Node: Texte, Next: Mule, Prev: Others, Up: Miscellaneous +File: recode.info, Node: Java, Next: Texte, Prev: Others, Up: Miscellaneous + +12.7 Java code +============== + +This charset is available under the name `Java', and should be +considered experimental for now. + + ASCII characters represent themselves. Character outside ASCII are +coded as `\uNNNN', where `NNNN' stands for the four-digit hexadecimal +value of the character within Unicode. The canonical representation +uses lower case for the `u' prefix and for the hexadecimal digits, yet +Recode also accepts upper case. + + There is currently no attempt to distinguish Java comments from Java +strings while the recoding goes, and this may be corrected some day. + + +File: recode.info, Node: Texte, Next: Mule, Prev: Java, Up: Miscellaneous -12.7 Easy French conventions +12.8 Easy French conventions ============================ This charset is available in Recode under the name `Texte' and has @@ -4284,7 +4304,7 @@ as being fairly evident.  File: recode.info, Node: Mule, Prev: Texte, Up: Miscellaneous -12.8 Mule as a multiplexed charset +12.9 Mule as a multiplexed charset ================================== This version of Recode barely starts supporting multiplexed or @@ -5872,6 +5892,7 @@ by Recode, and their aliases. * isoir91: Tabular. (line 27) * isoir92: Tabular. (line 27) * IT, aliases and source: Tabular. (line 520) +* Java: Java. (line 6) * JIS_C6220-1969: Tabular. (line 524) * JIS_C6220-1969-jp, aliases and source: Tabular. (line 524) * JIS_C6220-1969-ro, aliases and source: Tabular. (line 528) @@ -6083,95 +6104,96 @@ by Recode, and their aliases.  
Tag Table: Node: Top1148 -Node: Tutorial5575 -Node: Introduction9803 -Node: Charset overview14037 -Node: Surface overview15842 -Node: Contributing17310 -Ref: Contributing-Footnote-119544 -Node: Invoking recode19678 -Node: Synopsis20633 -Ref: Synopsis-Footnote-123073 -Node: Requests23370 -Ref: Requests-Footnote-129260 -Ref: Requests-Footnote-229327 -Ref: Requests-Footnote-329505 -Node: Listings29964 -Ref: Listings-Footnote-141113 -Node: Recoding41436 -Node: Reversibility44257 -Ref: Reversibility-Footnote-152712 -Node: Sequencing52849 -Node: Mixed55293 -Node: Emacs58661 -Node: Debugging59695 -Node: Library63965 -Node: Outer level65319 -Node: Request level72193 -Node: Task level83140 -Node: Charset level93562 -Node: Errors94404 -Ref: Errors-Footnote-199250 -Ref: Errors-Footnote-299364 -Node: Universal99725 -Ref: Universal-Footnote-1102837 -Ref: Universal-Footnote-2102903 -Node: UCS-2103116 -Node: UCS-4105642 -Node: UTF-7106182 -Node: UTF-8106777 -Node: UTF-16111082 -Node: count-characters112230 -Node: dump-with-names112901 -Node: iconv115450 -Node: Tabular118881 -Node: ASCII misc141094 -Node: ASCII141460 -Node: ISO 8859142276 -Node: ASCII-BS144570 -Node: flat146407 -Node: IBM and MS147078 -Node: EBCDIC147622 -Node: IBM-PC149718 -Ref: IBM-PC-Footnote-1151832 -Node: Icon-QNX151991 -Node: CDC152416 -Node: Display Code154097 -Ref: Display Code-Footnote-1156378 -Node: CDC-NOS156583 -Node: Bang-Bang158545 -Node: Micros160474 -Node: Apple-Mac160857 -Node: AtariST162891 -Node: Miscellaneous163877 -Node: HTML164610 -Node: LaTeX170599 -Node: Texinfo171373 -Node: Vietnamese172145 -Node: African173121 -Node: Others174471 -Node: Texte175925 -Ref: Texte-Footnote-1180475 -Ref: Texte-Footnote-2180555 -Ref: Texte-Footnote-3181030 -Node: Mule181127 -Ref: Mule-Footnote-1182908 -Node: Surfaces183427 -Ref: Surfaces-Footnote-1186846 -Node: Permutations186950 -Node: End lines187791 -Node: MIME189992 -Node: Dump191179 -Node: Test195349 -Node: Internals197827 -Node: Main flow199055 -Node: New 
charsets202158 -Node: New surfaces206696 -Node: Design207422 -Ref: Design-Footnote-1216588 -Node: Concept Index216692 -Node: Option Index231727 -Node: Library Index234580 -Node: Charset and Surface Index239155 +Node: Tutorial5609 +Node: Introduction9837 +Node: Charset overview14071 +Node: Surface overview15876 +Node: Contributing17344 +Ref: Contributing-Footnote-119578 +Node: Invoking recode19712 +Node: Synopsis20667 +Ref: Synopsis-Footnote-123107 +Node: Requests23404 +Ref: Requests-Footnote-129294 +Ref: Requests-Footnote-229361 +Ref: Requests-Footnote-329539 +Node: Listings29998 +Ref: Listings-Footnote-141147 +Node: Recoding41470 +Node: Reversibility44291 +Ref: Reversibility-Footnote-152746 +Node: Sequencing52883 +Node: Mixed55327 +Node: Emacs58695 +Node: Debugging59729 +Node: Library63999 +Node: Outer level65353 +Node: Request level72227 +Node: Task level83174 +Node: Charset level93596 +Node: Errors94438 +Ref: Errors-Footnote-199284 +Ref: Errors-Footnote-299398 +Node: Universal99759 +Ref: Universal-Footnote-1102871 +Ref: Universal-Footnote-2102937 +Node: UCS-2103150 +Node: UCS-4105676 +Node: UTF-7106216 +Node: UTF-8106811 +Node: UTF-16111116 +Node: count-characters112264 +Node: dump-with-names112935 +Node: iconv115484 +Node: Tabular118915 +Node: ASCII misc141128 +Node: ASCII141494 +Node: ISO 8859142310 +Node: ASCII-BS144604 +Node: flat146441 +Node: IBM and MS147112 +Node: EBCDIC147656 +Node: IBM-PC149752 +Ref: IBM-PC-Footnote-1151866 +Node: Icon-QNX152025 +Node: CDC152450 +Node: Display Code154131 +Ref: Display Code-Footnote-1156412 +Node: CDC-NOS156617 +Node: Bang-Bang158579 +Node: Micros160508 +Node: Apple-Mac160891 +Node: AtariST162925 +Node: Miscellaneous163911 +Node: HTML164748 +Node: LaTeX170737 +Node: Texinfo171511 +Node: Vietnamese172283 +Node: African173259 +Node: Others174609 +Node: Java176062 +Node: Texte176729 +Ref: Texte-Footnote-1181277 +Ref: Texte-Footnote-2181357 +Ref: Texte-Footnote-3181832 +Node: Mule181929 +Ref: Mule-Footnote-1183710 +Node: 
Surfaces184229 +Ref: Surfaces-Footnote-1187648 +Node: Permutations187752 +Node: End lines188593 +Node: MIME190794 +Node: Dump191981 +Node: Test196151 +Node: Internals198629 +Node: Main flow199857 +Node: New charsets202960 +Node: New surfaces207498 +Node: Design208224 +Ref: Design-Footnote-1217390 +Node: Concept Index217494 +Node: Option Index232529 +Node: Library Index235382 +Node: Charset and Surface Index239957  End Tag Table diff --git a/doc/recode.texi b/doc/recode.texi index 0fe7eec..2e8e1cf 100644 --- a/doc/recode.texi +++ b/doc/recode.texi @@ -178,6 +178,7 @@ Various other charsets * Vietnamese:: Vietnamese charsets * African:: African charsets * Others:: Cyrillic and other charsets +* Java:: Java code * Texte:: Easy French conventions * Mule:: Mule as a multiplexed charset @@ -3760,9 +3761,10 @@ knows how to handle these particularly well for French texts. * HTML:: World Wide Web representations * LaTeX:: LaTeX macro calls * Texinfo:: GNU project documentation files -* Vietnamese:: +* Vietnamese:: Vietnamese charsets * African:: African charsets -* Others:: +* Others:: Cyrillic and other charsets +* Java:: Java code * Texte:: Easy French conventions * Mule:: Mule as a multiplexed charset @end menu @@ -4056,7 +4058,7 @@ African charsets in this series. This charset is available in Recode under the name @code{AFRL1-101-BPI_OCIL}. Accepted aliases are @code{t-fra} and @code{t-francais}. -@node Others, Texte, African, Miscellaneous +@node Others, Java, African, Miscellaneous @section Cyrillic and other charsets @cindex Cyrillic charsets @@ -4097,7 +4099,23 @@ as an accepted alias. This charset is available under the name @code{KOI-8_CS2}. @end table -@node Texte, Mule, Others, Miscellaneous +@node Java, Texte, Others, Miscellaneous +@section Java code + +@tindex Java +This charset is available under the name @code{Java}, and should be +considered experimental for now. + +ASCII characters represent themselves. 
Characters outside ASCII are +coded as @samp{\uNNNN}, where @samp{NNNN} stands for the four-digit +hexadecimal value of the character within Unicode. The canonical +representation uses lower case for the @samp{u} prefix and for the +hexadecimal digits, yet Recode also accepts upper case. + +There is currently no attempt to distinguish Java comments from Java +strings while the recoding goes, and this may be corrected some day. + +@node Texte, Mule, Java, Miscellaneous @section Easy French conventions @tindex Texte diff --git a/doc/stamp-vti b/doc/stamp-vti index 58f7fec..265e4b2 100644 --- a/doc/stamp-vti +++ b/doc/stamp-vti @@ -1,4 +1,4 @@ -@set UPDATED 9 March 2008 +@set UPDATED 12 March 2008 @set UPDATED-MONTH March 2008 @set EDITION 3.7-beta2 @set VERSION 3.7-beta2 diff --git a/doc/version.texi b/doc/version.texi index 58f7fec..265e4b2 100644 --- a/doc/version.texi +++ b/doc/version.texi @@ -1,4 +1,4 @@ -@set UPDATED 9 March 2008 +@set UPDATED 12 March 2008 @set UPDATED-MONTH March 2008 @set EDITION 3.7-beta2 @set VERSION 3.7-beta2 diff --git a/src/ChangeLog b/src/ChangeLog index 3f9fce6..3bc492c 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,8 @@ +2008-03-13 François Pinard + + * java.c: New. + * Makefile.am: Adjusted. + 2008-03-12 François Pinard * recodext.h: Set bit field with, for ignore, from 2 to 1. 
diff --git a/src/Makefile.am b/src/Makefile.am index 48a0ca0..1f6f1ab 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -31,10 +31,12 @@ EXTRA_DIST = stamp-steps stamp-strip $(L_STEPS) mergelex.py $(MANS) CLEANFILES = iconvdecl.h C_STEPS = african.c afrtran.c applemac.c atarist.c bangbang.c cdcnos.c \ -ebcdic.c ibmpc.c iconqnx.c lat1asci.c mule.c strip-data.c testdump.c \ -ucs.c utf16.c utf7.c utf8.c varia.c vn.c $(C_FALLBACKS) $(C_SURFACES) \ -merged.c +ebcdic.c ibmpc.c iconqnx.c lat1asci.c java.c mule.c strip-data.c \ +testdump.c ucs.c utf16.c utf7.c utf8.c varia.c vn.c \ +$(C_FALLBACKS) $(C_SURFACES) merged.c + OLD_C_STEPS = next.c + L_STEPS = ascilat1.l $(L_FALLBACKS) $(L_SURFACES) H_FALLBACKS = diff --git a/src/Makefile.in b/src/Makefile.in index a5b8090..690019b 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -90,9 +90,10 @@ am__objects_2 = base64$U.lo dump$U.lo endline$U.lo permut$U.lo \ quoted$U.lo am__objects_3 = african$U.lo afrtran$U.lo applemac$U.lo atarist$U.lo \ bangbang$U.lo cdcnos$U.lo ebcdic$U.lo ibmpc$U.lo iconqnx$U.lo \ - lat1asci$U.lo mule$U.lo strip-data$U.lo testdump$U.lo ucs$U.lo \ - utf16$U.lo utf7$U.lo utf8$U.lo varia$U.lo vn$U.lo \ - $(am__objects_1) $(am__objects_2) merged$U.lo + lat1asci$U.lo java$U.lo mule$U.lo strip-data$U.lo \ + testdump$U.lo ucs$U.lo utf16$U.lo utf7$U.lo utf8$U.lo \ + varia$U.lo vn$U.lo $(am__objects_1) $(am__objects_2) \ + merged$U.lo am_librecode_la_OBJECTS = argmatch$U.lo charname$U.lo combine$U.lo \ exitfail$U.lo fr-charname$U.lo hash$U.lo iconv$U.lo \ localcharset$U.lo names$U.lo outer$U.lo quotearg$U.lo \ @@ -272,9 +273,9 @@ xalloc.h $(H_FALLBACKS) $(H_SURFACES) EXTRA_DIST = stamp-steps stamp-strip $(L_STEPS) mergelex.py $(MANS) CLEANFILES = iconvdecl.h C_STEPS = african.c afrtran.c applemac.c atarist.c bangbang.c cdcnos.c \ -ebcdic.c ibmpc.c iconqnx.c lat1asci.c mule.c strip-data.c testdump.c \ -ucs.c utf16.c utf7.c utf8.c varia.c vn.c $(C_FALLBACKS) $(C_SURFACES) \ -merged.c +ebcdic.c ibmpc.c 
iconqnx.c lat1asci.c java.c mule.c strip-data.c \ +testdump.c ucs.c utf16.c utf7.c utf8.c varia.c vn.c \ +$(C_FALLBACKS) $(C_SURFACES) merged.c OLD_C_STEPS = next.c L_STEPS = ascilat1.l $(L_FALLBACKS) $(L_SURFACES) @@ -452,6 +453,7 @@ mostlyclean-kr: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ibmpc$U.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iconqnx$U.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iconv$U.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/java$U.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lat1asci$U.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lat1ltex$U.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lat1txte$U.Plo@am__quote@ @@ -546,6 +548,8 @@ iconqnx_.c: iconqnx.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/iconqnx.c; then echo $(srcdir)/iconqnx.c; else echo iconqnx.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ iconv_.c: iconv.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/iconv.c; then echo $(srcdir)/iconv.c; else echo iconv.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ +java_.c: java.c $(ANSI2KNR) + $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/java.c; then echo $(srcdir)/java.c; else echo java.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ lat1asci_.c: lat1asci.c $(ANSI2KNR) $(CPP) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) `if test -f $(srcdir)/lat1asci.c; then echo $(srcdir)/lat1asci.c; else echo lat1asci.c; fi` | sed 's/^# \([0-9]\)/#line \1/' | $(ANSI2KNR) > $@ || rm -f $@ lat1ltex_.c: lat1ltex.c $(ANSI2KNR) @@ -614,21 +618,21 @@ endline_.$(OBJEXT) endline_.lo exitfail_.$(OBJEXT) exitfail_.lo \ flat_.$(OBJEXT) flat_.lo fr-charname_.$(OBJEXT) 
fr-charname_.lo \ freeze_.$(OBJEXT) freeze_.lo hash_.$(OBJEXT) hash_.lo html_.$(OBJEXT) \ html_.lo ibmpc_.$(OBJEXT) ibmpc_.lo iconqnx_.$(OBJEXT) iconqnx_.lo \ -iconv_.$(OBJEXT) iconv_.lo lat1asci_.$(OBJEXT) lat1asci_.lo \ -lat1ltex_.$(OBJEXT) lat1ltex_.lo lat1txte_.$(OBJEXT) lat1txte_.lo \ -localcharset_.$(OBJEXT) localcharset_.lo main_.$(OBJEXT) main_.lo \ -merged_.$(OBJEXT) merged_.lo mixed_.$(OBJEXT) mixed_.lo \ -mule_.$(OBJEXT) mule_.lo names_.$(OBJEXT) names_.lo outer_.$(OBJEXT) \ -outer_.lo permut_.$(OBJEXT) permut_.lo quotearg_.$(OBJEXT) \ -quotearg_.lo quoted_.$(OBJEXT) quoted_.lo recode_.$(OBJEXT) recode_.lo \ -request_.$(OBJEXT) request_.lo rfc1345_.$(OBJEXT) rfc1345_.lo \ -strip-data_.$(OBJEXT) strip-data_.lo strip-pool_.$(OBJEXT) \ -strip-pool_.lo task_.$(OBJEXT) task_.lo testdump_.$(OBJEXT) \ -testdump_.lo texinfo_.$(OBJEXT) texinfo_.lo ucs_.$(OBJEXT) ucs_.lo \ -utf16_.$(OBJEXT) utf16_.lo utf7_.$(OBJEXT) utf7_.lo utf8_.$(OBJEXT) \ -utf8_.lo varia_.$(OBJEXT) varia_.lo vn_.$(OBJEXT) vn_.lo \ -xalloc-die_.$(OBJEXT) xalloc-die_.lo xmalloc_.$(OBJEXT) xmalloc_.lo : \ -$(ANSI2KNR) +iconv_.$(OBJEXT) iconv_.lo java_.$(OBJEXT) java_.lo \ +lat1asci_.$(OBJEXT) lat1asci_.lo lat1ltex_.$(OBJEXT) lat1ltex_.lo \ +lat1txte_.$(OBJEXT) lat1txte_.lo localcharset_.$(OBJEXT) \ +localcharset_.lo main_.$(OBJEXT) main_.lo merged_.$(OBJEXT) merged_.lo \ +mixed_.$(OBJEXT) mixed_.lo mule_.$(OBJEXT) mule_.lo names_.$(OBJEXT) \ +names_.lo outer_.$(OBJEXT) outer_.lo permut_.$(OBJEXT) permut_.lo \ +quotearg_.$(OBJEXT) quotearg_.lo quoted_.$(OBJEXT) quoted_.lo \ +recode_.$(OBJEXT) recode_.lo request_.$(OBJEXT) request_.lo \ +rfc1345_.$(OBJEXT) rfc1345_.lo strip-data_.$(OBJEXT) strip-data_.lo \ +strip-pool_.$(OBJEXT) strip-pool_.lo task_.$(OBJEXT) task_.lo \ +testdump_.$(OBJEXT) testdump_.lo texinfo_.$(OBJEXT) texinfo_.lo \ +ucs_.$(OBJEXT) ucs_.lo utf16_.$(OBJEXT) utf16_.lo utf7_.$(OBJEXT) \ +utf7_.lo utf8_.$(OBJEXT) utf8_.lo varia_.$(OBJEXT) varia_.lo \ +vn_.$(OBJEXT) vn_.lo 
xalloc-die_.$(OBJEXT) xalloc-die_.lo \ +xmalloc_.$(OBJEXT) xmalloc_.lo : $(ANSI2KNR) mostlyclean-libtool: -rm -f *.lo diff --git a/src/decsteps.h b/src/decsteps.h index d400e7e..222c034 100644 --- a/src/decsteps.h +++ b/src/decsteps.h @@ -8,6 +8,7 @@ bool module_ebcdic PARAMS ((struct recode_outer *)); bool module_ibmpc PARAMS ((struct recode_outer *)); bool module_iconqnx PARAMS ((struct recode_outer *)); bool module_latin1_ascii PARAMS ((struct recode_outer *)); +bool module_java PARAMS ((struct recode_outer *)); bool module_mule PARAMS ((struct recode_outer *)); bool module_strips PARAMS ((struct recode_outer *)); bool module_testdump PARAMS ((struct recode_outer *)); @@ -41,6 +42,7 @@ void delmodule_ebcdic PARAMS ((struct recode_outer *)); void delmodule_ibmpc PARAMS ((struct recode_outer *)); void delmodule_iconqnx PARAMS ((struct recode_outer *)); void delmodule_latin1_ascii PARAMS ((struct recode_outer *)); +void delmodule_java PARAMS ((struct recode_outer *)); void delmodule_mule PARAMS ((struct recode_outer *)); void delmodule_strips PARAMS ((struct recode_outer *)); void delmodule_testdump PARAMS ((struct recode_outer *)); diff --git a/src/inisteps.h b/src/inisteps.h index a365a01..b276803 100644 --- a/src/inisteps.h +++ b/src/inisteps.h @@ -8,6 +8,7 @@ if (!module_ibmpc (outer)) return false; if (!module_iconqnx (outer)) return false; if (!module_latin1_ascii (outer)) return false; + if (!module_java (outer)) return false; if (!module_mule (outer)) return false; if (!module_strips (outer)) return false; if (!module_testdump (outer)) return false; diff --git a/src/java.c b/src/java.c new file mode 100644 index 0000000..f90dc0e --- /dev/null +++ b/src/java.c @@ -0,0 +1,116 @@ +/* Conversion of files between different charsets and surfaces. + Copyright © 2008 Free Software Foundation, Inc. + Contributed by François Pinard , 2008. 
+ + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public License + as published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This library is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied warranty + of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the Recode Library; see the file `COPYING.LIB'. + If not, write to the Free Software Foundation, Inc., 59 Temple Place - + Suite 330, Boston, MA 02111-1307, USA. */ + +#include "common.h" + +static bool +transform_java_utf16 (RECODE_SUBTASK subtask) +{ + int character = get_byte (subtask); + + while (character != EOF) + if (character == '\\') + { + char buffer[6]; + char *cursor = buffer; + unsigned value = 0; + bool canonical = true; + + *cursor++ = character; + character = get_byte (subtask); + if (character == 'u' || character == 'U') + { + if (character == 'U') + canonical = false; + *cursor++ = character; + character = get_byte (subtask); + while (cursor < buffer + 6) + { + if (character >= '0' && character <= '9') + value = (value << 4) | (character - '0'); + else if (character >= 'A' && character <= 'F') + { + value = (value << 4) | (character - 'A' + 10); + canonical = false; + } + else if (character >= 'a' && character <= 'f') + value = (value << 4) | (character - 'a' + 10); + else + break; + *cursor++ = character; + character = get_byte (subtask); + } + if (cursor == buffer + 6) + { + if (!canonical) + RETURN_IF_NOGO (RECODE_NOT_CANONICAL, subtask); + put_ucs2 (value, subtask); + continue; + } + } + *cursor = '\0'; + for (cursor = buffer; *cursor; cursor++) + put_ucs2 (*cursor, subtask); + } + else + { + put_ucs2 (character, subtask); + character = get_byte (subtask); + 
} + + SUBTASK_RETURN (subtask); +} + +static bool +transform_utf16_java (RECODE_SUBTASK subtask) +{ + unsigned value; + + while (get_ucs2 (&value, subtask)) + if (value < 128) + put_byte (value, subtask); + else + { + char buffer[7]; + char *cursor; + + sprintf (buffer, "\\u%04x", value); + for (cursor = buffer; *cursor; cursor++) + put_byte (*cursor, subtask); + } + + SUBTASK_RETURN (subtask); +} + +bool +module_java (RECODE_OUTER outer) +{ + return + declare_single (outer, "UTF-16", "Java", + outer->quality_ucs2_to_variable, + NULL, transform_utf16_java) + && declare_single (outer, "Java", "UTF-16", + outer->quality_variable_to_ucs2, + NULL, transform_java_utf16); +} + +void +delmodule_java (RECODE_OUTER outer) +{ +} diff --git a/src/tersteps.h b/src/tersteps.h index 49058db..06baece 100644 --- a/src/tersteps.h +++ b/src/tersteps.h @@ -8,6 +8,7 @@ delmodule_ibmpc (outer); delmodule_iconqnx (outer); delmodule_latin1_ascii (outer); + delmodule_java (outer); delmodule_mule (outer); delmodule_strips (outer); delmodule_testdump (outer); diff --git a/tests/ChangeLog b/tests/ChangeLog index 99b8f2b..71bfdc6 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,7 @@ +2008-03-13 François Pinard + + * t40_java.py: New. + 2008-03-11 François Pinard * Recode.pyx: Adjusted for iconv_name. diff --git a/tests/t40_java.py b/tests/t40_java.py new file mode 100644 index 0000000..1b671ab --- /dev/null +++ b/tests/t40_java.py @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +import common +from common import setup_module, teardown_module + +input = '''\ +Dear =DEorvard=F0ur, + +=AB O=F9 qu'il r=E9side, =E0 N=EEmes ou m=EAme Capharna=FCm, tout Fran=E7ai= +s inscrit +au r=F4le payera son d=FB d=E8s avant No=EBl, qu'il soit na=EFf ou r=E2leur= +. 
=BB +''' + +output = '''\ +Dear \u00deorvard\u00f0ur, + +\u00ab O\u00f9 qu'il r\u00e9side, \u00e0 N\u00eemes ou m\u00eame Capharna\u00fcm, tout Fran\u00e7ais inscrit +au r\u00f4le payera son d\u00fb d\u00e8s avant No\u00ebl, qu'il soit na\u00eff ou r\u00e2leur. \u00bb +''' + +class Test: + + def test_1(self): + # Block of lines to JAVA. + common.request('l1/qp..java') + common.validate(input, output) + + def test_2(self): + # Block of lines to JAVA and back. + common.request('l1/qp..java') + common.validate_back(input) -- 2.40.0