Index: HTMLparser.c @@ -607,7 +607,7 @@ */ htmlElemDescPtr htmlTagLookup(const xmlChar *tag) { - int i = 0; + int i; for (i = 0; i < (sizeof(html40ElementTable) / sizeof(html40ElementTable[0]));i++) { @@ -911,10 +911,11 @@ htmlEntityDesc html40EntitiesTable[] = { /* - * the 4 absolute ones, + * the 4 absolute ones, plus apostrophe. */ { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, { 38, "amp", "ampersand, U+0026 ISOnum" }, +{ 39, "apos", "single quote" }, { 60, "lt", "less-than sign, U+003C ISOnum" }, { 62, "gt", "greater-than sign, U+003E ISOnum" }, @@ -922,7 +923,6 @@ * A bunch still in the 128-255 range * Replacing them depend really on the charset used. */ -{ 39, "apos", "single quote" }, { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, { 162, "cent", "cent sign, U+00A2 ISOnum" }, @@ -1020,11 +1020,20 @@ { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, +{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, +{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, +{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, +{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, +{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, + /* * Anything below should really be kept as entities references */ { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, +{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, +{ 732, "tilde","small tilde, U+02DC ISOdia" }, + { 913, "Alpha","greek capital letter alpha, U+0391" }, { 914, "Beta", "greek capital letter beta, U+0392" }, { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, @@ -1079,15 +1088,42 @@ { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, +{ 8194, "ensp", "en 
space, U+2002 ISOpub" }, +{ 8195, "emsp", "em space, U+2003 ISOpub" }, +{ 8201, "thinsp","thin space, U+2009 ISOpub" }, +{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, +{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, +{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, +{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, +{ 8211, "ndash","en dash, U+2013 ISOpub" }, +{ 8212, "mdash","em dash, U+2014 ISOpub" }, +{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, +{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, +{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, +{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, +{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, +{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, +{ 8224, "dagger","dagger, U+2020 ISOpub" }, +{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, + { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, + +{ 8240, "permil","per mille sign, U+2030 ISOtech" }, + { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, + +{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, +{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, + { 8254, "oline","overline = spacing overscore, U+203E NEW" }, { 8260, "frasl","fraction slash, U+2044 NEW" }, -{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, +{ 8364, "euro", "euro sign, U+20AC NEW" }, + { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, +{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, { 8482, "trade","trade mark sign, U+2122 ISOnum" }, { 8501, "alefsym","alef 
symbol = first transfinite cardinal, U+2135 NEW" }, @@ -1103,7 +1139,6 @@ { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, - { 8704, "forall","for all, U+2200 ISOtech" }, { 8706, "part", "partial differential, U+2202 ISOtech" }, { 8707, "exist","there exists, U+2203 ISOtech" }, @@ -1155,35 +1190,6 @@ { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, { 9830, "diams","black diamond suit, U+2666 ISOpub" }, -{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, -{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, -{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, -{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, -{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, -{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, -{ 732, "tilde","small tilde, U+02DC ISOdia" }, - -{ 8194, "ensp", "en space, U+2002 ISOpub" }, -{ 8195, "emsp", "em space, U+2003 ISOpub" }, -{ 8201, "thinsp","thin space, U+2009 ISOpub" }, -{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, -{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, -{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, -{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, -{ 8211, "ndash","en dash, U+2013 ISOpub" }, -{ 8212, "mdash","em dash, U+2014 ISOpub" }, -{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, -{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, -{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, -{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, -{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, -{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, -{ 8224, "dagger","dagger, U+2020 ISOpub" }, -{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, -{ 8240, "permil","per mille sign, U+2030 ISOtech" }, -{ 8249, "lsaquo","single 
left-pointing angle quotation mark, U+2039 ISO proposed" }, -{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, -{ 8364, "euro", "euro sign, U+20AC NEW" } }; /************************************************************************ @@ -1205,7 +1211,7 @@ } /** - * htmlEntityLookup: + * htmlEntityNameLookup: * @name: the entity name * * Lookup the given entity in EntitiesTable @@ -1215,7 +1221,7 @@ * Returns the associated htmlEntityDescPtr if found, NULL otherwise. */ htmlEntityDescPtr -htmlEntityLookup(const xmlChar *name) { +htmlEntityNameLookup(const xmlChar *name) { int i; for (i = 0;i < (sizeof(html40EntitiesTable)/ @@ -1231,6 +1237,44 @@ } /** + * htmlEntityValueLookup: + * @value: the entity's unicode value + * + * Look up the entity with the given value in html40EntitiesTable. The scan stops at the first entry whose value exceeds @value, so the table must be sorted by ascending value. + * + * TODO: the linear scan is really ugly, a hash table is really needed. + * + * Returns the associated htmlEntityDescPtr if found, NULL otherwise. + */ +htmlEntityDescPtr +htmlEntityValueLookup(int value) { + int i; +#ifdef DEBUG + int lv = 0; +#endif + + for (i = 0;i < (sizeof(html40EntitiesTable)/ + sizeof(html40EntitiesTable[0]));i++) { + if (html40EntitiesTable[i].value >= value) { + if (html40EntitiesTable[i].value > value) + break; +#ifdef DEBUG + fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name); +#endif + return(&html40EntitiesTable[i]); + } +#ifdef DEBUG + if (lv > html40EntitiesTable[i].value) { + fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n", + lv, html40EntitiesTable[i].value); + } + lv = html40EntitiesTable[i].value; +#endif + } + return(NULL); +} + +/** * UTF8ToHtml: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out @@ -1297,39 +1341,63 @@ /* assertion: c is a single UTF-4 value */ if (c < 0x80) { - if (out >= outend) + switch (c) { + case '&': + if (out + 5 > outend) { + *outlen = out - outstart; + *inlen = processed - instart; + return(0); + } + memcpy(out, "&amp;", 5); + out += 5;
+ break; + case '<': + if (out + 4 > outend) { + *outlen = out - outstart; + *inlen = processed - instart; + return(0); + } + memcpy(out, "&lt;", 4); + out += 4; + break; + case '>': + if (out + 4 > outend) { + *outlen = out - outstart; + *inlen = processed - instart; + return(0); + } + memcpy(out, "&gt;", 4); + out += 4; break; - *out++ = c; + default: + if (out >= outend) { + *outlen = out - outstart; + *inlen = processed - instart; + return(0); + } + *out++ = c; + break; + } } else { - int i, j, len; + int len; + htmlEntityDescPtr ent; /* * Try to lookup a predefined HTML entity for it */ - for (i = 0;i < (sizeof(html40EntitiesTable)/ - sizeof(html40EntitiesTable[0]));i++) { - if (html40EntitiesTable[i].value == c) { -#ifdef DEBUG - fprintf(stderr,"Found entity %s\n", - html40EntitiesTable[i].name); -#endif - goto found_ent; - } - if (html40EntitiesTable[i].value > c) - break; + ent = htmlEntityValueLookup(c); + if (!ent) { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); } - - /* no chance for this in Ascii */ - *outlen = out - outstart; - *inlen = processed - instart; - return(-2); -found_ent: - len = strlen(html40EntitiesTable[i].name); - if (out + 2 + len >= outend) + len = strlen(ent->name); + if (out + 2 + len > outend) break; *out++ = '&'; - for (j = 0;j < len;j++) - *out++ = html40EntitiesTable[i].name[j]; + memcpy(out, ent->name, len); + out += len; *out++ = ';'; } processed = in; @@ -1935,7 +2003,7 @@ /* * Lookup the entity in the table. */ - ent = htmlEntityLookup(name); + ent = htmlEntityNameLookup(name); if (ent != NULL) /* OK that's ugly !!! */ NEXT; } else { Index: HTMLparser.h @@ -57,7 +57,8 @@ * There is only few public functions.
*/ htmlElemDescPtr htmlTagLookup (const xmlChar *tag); -htmlEntityDescPtr htmlEntityLookup(const xmlChar *name); +htmlEntityDescPtr htmlEntityNameLookup(const xmlChar *name); +htmlEntityDescPtr htmlEntityValueLookup(int value); int htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem);