HTML::Entities and WinLatin1 NCRs [PATCH]
am 06.03.2006 22:09:53 von ChrisD
Hi --
I use the HTML::Entities module quite a bit and have really
appreciated its support for Unicode characters > 256 with Perl 5.8.
I do have one particular issue that crops up for me, and I thought
it might affect others as well, so I'm including a crude set of
patches with my "fix". In short, I have to support HTML documents
authored by a wide variety of people, and over time they've
accumulated numeric character references to the troublesome set
of characters between 128 and 159, mostly due to authors working
on Windows platforms. The same documents now may also have
character references to the Unicode code points for those characters.
Here's a simple example: "two em &#151; &#8212; dashes".
Now, in my particular situation, I sometimes want to decode
these entities to the same code point, so that, for example, I can
match strings against each other. At first I thought I might
get away with this:
$a = Encode::encode('utf8', $a); # force no utf8 flag
HTML::Entities::decode_entities($a);
$a = Encode::decode('cp1252', $a) unless (Encode::is_utf8($a));
But while that will turn "&#151;" into U+2014, it turns
"&#151;&#8212;" into U+0097 U+2014, which doesn't help.
So, I whacked into place a decode_entities_cp1252() function
that decodes any numeric character references in the 128-159
range (except for a couple of undefined ones) to the UTF-8
equivalents. I'm positive there are nicer, more elegant, and
probably more flexible ways to do this, but lacking additional
time to experiment, this is where I stopped.
I pondered briefly trying to allow any character set mapping
to be applied to these characters, but concluded that using
WinLatin1 (a.k.a. Microsoft code page 1252) was actually sufficient
to match what most/all modern browsers do with these character
references. For example, this test page:
http://www.fifi.org/doc/lynx/test/c1.html
on my Linux Mozilla 1.7 browser displays matching columns of
glyphs, so Mozilla seems to be mapping these WinLatin1 characters
to their Unicode equivalents. Further, a test at our offices on
a variety of Windows and Mac browsers and systems didn't find any
that failed to display all these characters "properly".
Here's another gratuitous link:
http://home.earthlink.net/~bobbau/platforms/specialchars/
Well, here's my hacky patch for version 3.50, FWIW. Thanks
for the great modules!
Chris.
================================================ cp1252.patch
--- lib/HTML/Entities.pm.orig 2006-03-06 12:18:12.272613000 -0500
+++ lib/HTML/Entities.pm 2006-03-06 12:18:42.950260000 -0500
@@ -127,7 +127,7 @@
require Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(encode_entities decode_entities _decode_entities);
+@EXPORT = qw(encode_entities decode_entities decode_entities_cp1252
_decode_entities);
@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
$VERSION = sprintf("%d.%02d", q$Revision: 1.32 $ =~ /(\d+)\.(\d+)/);
--- MANIFEST.orig 2006-03-06 13:19:55.364120000 -0500
+++ MANIFEST 2006-03-06 13:20:13.055791000 -0500
@@ -40,6 +40,7 @@
t/dtext.t Test dtext decoding of entities
t/entities.t Test encoding/decoding of entities
t/entities2.t Test _decode_entities()
+t/entities3.t Test decode_entities_cp1252()
t/filter-methods.t Test ignore_tags, ignore_elements methods.
t/filter.t Test HTML::Filter
t/handler-eof.t Test invocation of $p->eof in handlers
--- Parser.xs.orig 2006-03-06 11:53:43.401973000 -0500
+++ Parser.xs 2006-03-06 12:05:53.599678000 -0500
@@ -489,7 +489,24 @@
ST(i) = sv_2mortal(newSVsv(ST(i)));
else if (SvREADONLY(ST(i)))
croak("Can't inline decode readonly string");
- decode_entities(aTHX_ ST(i), entity2char, 0);
+ decode_entities(aTHX_ ST(i), entity2char, 0, 0);
+ }
+ SP += items;
+
+void
+decode_entities_cp1252(...)
+ PREINIT:
+ int i;
+ HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE);
+ PPCODE:
+ if (GIMME_V == G_SCALAR && items > 1)
+ items = 1;
+ for (i = 0; i < items; i++) {
+ if (GIMME_V != G_VOID)
+ ST(i) = sv_2mortal(newSVsv(ST(i)));
+ else if (SvREADONLY(ST(i)))
+ croak("Can't inline decode readonly string");
+ decode_entities(aTHX_ ST(i), entity2char, 0, 1);
}
SP += items;
@@ -514,7 +531,7 @@
}
if (SvREADONLY(string))
croak("Can't inline decode readonly string");
- decode_entities(aTHX_ string, entities_hv, allow_unterminated);
+ decode_entities(aTHX_ string, entities_hv, allow_unterminated, 0);
bool
_probably_utf8_chunk(string)
--- hparser.c.orig 2006-03-06 15:31:33.228418000 -0500
+++ hparser.c 2006-03-06 15:31:44.401579000 -0500
@@ -465,7 +465,7 @@
if (p_state->utf8_mode)
sv_utf8_decode(attrval);
#endif
- decode_entities(aTHX_ attrval, p_state->entity2char, 0);
+ decode_entities(aTHX_ attrval, p_state->entity2char, 0, 0);
if (p_state->utf8_mode)
SvUTF8_off(attrval);
}
@@ -537,7 +537,7 @@
if (p_state->utf8_mode)
sv_utf8_decode(arg);
#endif
- decode_entities(aTHX_ arg, p_state->entity2char, 1);
+ decode_entities(aTHX_ arg, p_state->entity2char, 1, 0);
if (p_state->utf8_mode)
SvUTF8_off(arg);
}
--- util.c.orig 2006-03-06 14:07:52.686794000 -0500
+++ util.c 2006-03-06 14:07:55.647755000 -0500
@@ -11,6 +11,37 @@
#endif
+#ifdef UNICODE_HTML_PARSER
+#define CP1252_MAX_LEN 3
+
+static const int cp1252_len[32] =
+{
+ 3, 0, 3, 2, 3, 3, 3, 3, 2, 3, 2, 3, 2, 0, 2, 0,
+ 0, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 2, 0, 2, 2
+};
+
+static const unsigned char cp1252_utf8[32][CP1252_MAX_LEN] =
+{
+ { 0xE2, 0x82, 0xAC }, { 0, 0, 0 },
+ { 0xE2, 0x80, 0x9A }, { 0xC6, 0x92, 0 },
+ { 0xE2, 0x80, 0x9E }, { 0xE2, 0x80, 0xA6 },
+ { 0xE2, 0x80, 0xA0 }, { 0xE2, 0x80, 0xA1 },
+ { 0xCB, 0x86, 0 }, { 0xE2, 0x80, 0xB0 },
+ { 0xC5, 0xA0, 0 }, { 0xE2, 0x80, 0xB9 },
+ { 0xC5, 0x92, 0 }, { 0, 0, 0 },
+ { 0xC5, 0xBD, 0 }, { 0, 0, 0 },
+ { 0, 0, 0 }, { 0xE2, 0x80, 0x98 },
+ { 0xE2, 0x80, 0x99 }, { 0xE2, 0x80, 0x9C },
+ { 0xE2, 0x80, 0x9D }, { 0xE2, 0x80, 0xA2 },
+ { 0xE2, 0x80, 0x93 }, { 0xE2, 0x80, 0x94 },
+ { 0xCB, 0x9C, 0 }, { 0xE2, 0x84, 0xA2 },
+ { 0xC5, 0xA1, 0 }, { 0xE2, 0x80, 0xBA },
+ { 0xC5, 0x93, 0 }, { 0, 0, 0 },
+ { 0xC5, 0xBE, 0 }, { 0xC5, 0xB8, 0 }
+};
+#endif
+
+
EXTERN SV*
sv_lower(pTHX_ SV* sv)
{
@@ -63,7 +94,7 @@
}
EXTERN SV*
-decode_entities(pTHX_ SV* sv, HV* entity2char, bool allow_unterminated)
+decode_entities(pTHX_ SV* sv, HV* entity2char, bool allow_unterminated,
bool cp1252)
{
STRLEN len;
char *s = SvPV_force(sv, len);
@@ -132,7 +163,12 @@
}
if (ok) {
#ifdef UNICODE_HTML_PARSER
- if (!SvUTF8(sv) && num <= 255) {
+ if (cp1252 && num >= 128 && num < 160 && cp1252_len[num & 0x7F] > 0) {
+ repl = (char*) cp1252_utf8[num & 0x7F];
+ repl_len = cp1252_len[num & 0x7F];
+ repl_utf8 = 1;
+ }
+ else if (!SvUTF8(sv) && num <= 255) {
buf[0] = (char) num;
repl = buf;
repl_len = 1;
================================================ t/entities3.t
# t/entities3.t - tests for decode_entities_cp1252().
#
# The first three tests check that named and numeric references still
# round-trip through decode/encode.  The next two feed every numeric
# reference in the 0x80-0x9F range (decimal and hex form) through the
# decoder and compare the result with the mapping table stored after
# __END__.  The last test checks that &apos; is decoded.
use HTML::Entities qw(decode_entities_cp1252 encode_entities
                      encode_entities_numeric);
use Test::More tests => 6;

# The input must contain the references in their *encoded* form; the last
# one is written without its terminating ';'.
$a = "V&aring;re norske tegn b&oslash;r &#230res";
decode_entities_cp1252($a);
is($a, "Våre norske tegn bør æres");

encode_entities($a);
is($a, "V&aring;re norske tegn b&oslash;r &aelig;res");

decode_entities_cp1252($a);
encode_entities_numeric($a);
is($a, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res");

# See how well it does against CP1252
$ent = $hexent = $plain = "";
while (<DATA>) {
    # Columns: CP1252 byte, optional Unicode code point, comment.  Any
    # whitespace is accepted between columns so the table is still parsed
    # if its tabs were converted to spaces.
    next unless /^(0x[0-9a-f]{2})\s+(0x[0-9a-f]{4})?/i;
    $hexnum = hex($1);
    $ent    .= "&#$hexnum;";                  # decimal reference
    $hexent .= sprintf("&#x%x;", $hexnum);    # hexadecimal reference
    # Bytes with no Unicode mapping are expected to decode to themselves.
    $plain  .= defined($2) ? chr(hex($2)) : chr($hexnum);
}

$a = $ent;
decode_entities_cp1252($a);
is($a, $plain);

$a = $hexent;
decode_entities_cp1252($a);
is($a, $plain);

# Decoding of &apos;
is(decode_entities_cp1252("&apos;"), "'");
__END__
# ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
0x80 0x20AC #EURO SIGN
0x81 #UNDEFINED
0x82 0x201A #SINGLE LOW-9 QUOTATION MARK
0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK
0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK
0x85 0x2026 #HORIZONTAL ELLIPSIS
0x86 0x2020 #DAGGER
0x87 0x2021 #DOUBLE DAGGER
0x88 0x02C6 #MODIFIER LETTER CIRCUMFLEX ACCENT
0x89 0x2030 #PER MILLE SIGN
0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON
0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8C 0x0152 #LATIN CAPITAL LIGATURE OE
0x8D #UNDEFINED
0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON
0x8F #UNDEFINED
0x90 #UNDEFINED
0x91 0x2018 #LEFT SINGLE QUOTATION MARK
0x92 0x2019 #RIGHT SINGLE QUOTATION MARK
0x93 0x201C #LEFT DOUBLE QUOTATION MARK
0x94 0x201D #RIGHT DOUBLE QUOTATION MARK
0x95 0x2022 #BULLET
0x96 0x2013 #EN DASH
0x97 0x2014 #EM DASH
0x98 0x02DC #SMALL TILDE
0x99 0x2122 #TRADE MARK SIGN
0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON
0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9C 0x0153 #LATIN SMALL LIGATURE OE
0x9D #UNDEFINED
0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON
0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS
================================================
--
GPG Key ID: 366A375B
GPG Key Fingerprint: 485E 5041 17E1 E2BB C263 E4DE C8E3 FA36 366A 375B