Index: HTMLparser.c @@ -1374,6 +1374,99 @@ return(0); } +/** + * htmlEncodeEntities: + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of UTF-8 chars + * @inlen: the length of @in + * @quoteChar: the quote character to escape (' or ") or zero. + * + * Take a block of UTF-8 chars in and try to convert it to an ASCII + * plus HTML entities block of chars out. None of the pointer arguments is checked for NULL. + * + * Returns 0 on success, including when conversion stops early because @out is full or @in ends inside a multi-byte character, and -2 if a malformed UTF-8 sequence is found; -1 is never returned by this implementation. + * The value of @inlen after return is the number of input octets consumed, + * counting only characters that were completely converted, whatever the return value. + * The value of @outlen after return is the number of octets written to @out. + */ +int +htmlEncodeEntities(unsigned char* out, int *outlen, + const unsigned char* in, int *inlen, int quoteChar) { + const unsigned char* processed = in; + const unsigned char* outend = out + (*outlen); + const unsigned char* outstart = out; + const unsigned char* instart = in; + const unsigned char* inend = in + (*inlen); + unsigned int c, d; + int trailing; + + while (in < inend) { + d = *in++; + if (d < 0x80) { c= d; trailing= 0; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } + else if (d < 0xF8) { c= d & 0x07; trailing= 3; } + else { + /* no chance for this in Ascii */ + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + + if (inend - in < trailing) + break; + + while (trailing--) { + if (((d= *in++) & 0xC0) != 0x80) { + *outlen = out - outstart; + *inlen = processed - instart; + return(-2); + } + c <<= 6; + c |= d & 0x3F; + } + + /* assertion: c is a single UCS-4 value */ + if (c < 0x80 && c != quoteChar && c != '&' && c != '<' && c != '>') { + if (out >= outend) + break; + *out++ = c; + } else { + htmlEntityDescPtr ent; + const char *cp; + 
char nbuf[16]; + int len; + + /* + * Try to look up a predefined HTML entity for it, else use a decimal numeric reference + */ + ent = htmlEntityValueLookup(c); + if (ent == NULL) { + sprintf(nbuf, "#%u", c); + cp = nbuf; + } + else + cp = ent->name; + len = strlen(cp); + if (out + 2 + len > outend) + break; + *out++ = '&'; + memcpy(out, cp, len); + out += len; + *out++ = ';'; + } + processed = in; + } + *outlen = out - outstart; + *inlen = processed - instart; + return(0); +} /** * htmlDecodeEntities: Index: HTMLparser.h @@ -86,6 +86,10 @@ int *outlen, const unsigned char* in, int *inlen); +int htmlEncodeEntities(unsigned char* out, + int *outlen, + const unsigned char* in, + int *inlen, int quoteChar); /** * Interfaces for the Push mode Index: testHTML.c @@ -368,8 +368,19 @@ if (atts != NULL) { for (i = 0;(atts[i] != NULL);i++) { fprintf(stdout, ", %s", atts[i++]); - if (atts[i] != NULL) - fprintf(stdout, "='%s'", atts[i]); + if (atts[i] != NULL) { + unsigned char output[40]; + const unsigned char *att = atts[i]; + int outlen, attlen; + fprintf(stdout, "='"); + while ((attlen = strlen((char*)att)) > 0) { + outlen = sizeof output - 1; + htmlEncodeEntities(output, &outlen, att, &attlen, '\''); + fprintf(stdout, "%.*s", outlen, output); + att += attlen; + } + fprintf(stdout, "'"); + } } } fprintf(stdout, ")\n"); @@ -400,12 +411,11 @@ void charactersDebug(void *ctx, const xmlChar *ch, int len) { - char output[40]; - int i; + unsigned char output[40]; + int outlen = 30; - for (i = 0;(i