From: Bjorn Reese (breese@mail1.stofanet.dk)
Date: Sat Oct 14 2000 - 13:26:26 EDT
Attached is a patch for the HTML SCRIPT element, whose contents should
not be parsed or encoded. The regression tests break, but that is
because the results of the regression tests are wrong.
A similar problem exists for the HTML STYLE element (which should be
straightforward to add), and for attributes containing script code.
The latter could be solved by recognizing quotes for the attributes.
My patch does not include any of this.
I have also attached a simplistic .html file used for testing.
*** ../libxml2-2.2.5/HTMLparser.c Wed Oct 11 01:12:56 2000
--- HTMLparser.c Sat Oct 14 19:13:14 2000
***************
*** 2210,2215 ****
--- 2210,2242 ----
}
/**
+ * htmlParseScript:
+ * @ctxt: an HTML parser context
+ */
+ void htmlParseScript(htmlParserCtxtPtr ctxt) {
+ int i = 0;
+ const xmlChar *start = CUR_PTR;
+
+ while (IS_CHAR(CUR)) {
+ if ((CUR == '<') && (NXT(1) == '/') &&
+ (UPP(2) == 'S') && (UPP(3) == 'C') &&
+ (UPP(4) == 'R') && (UPP(5) == 'I') &&
+ (UPP(6) == 'P') && (UPP(7) == 'T') &&
+ (NXT(8) == '>'))
+ break; /* while */
+ NEXT;
+ }
+ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
+ if (ctxt->sax->cdataBlock!= NULL) {
+ /*
+ * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
+ */
+ ctxt->sax->cdataBlock(ctxt->userData, start, CUR_PTR - start);
+ }
+ }
+ }
+
+ /**
* htmlParseCharData:
* @ctxt: an HTML parser context
* @cdata: int indicating whether we are within a CDATA section
***************
*** 3112,3179 ****
return;
}
! /*
! * Sometimes DOCTYPE arrives in the middle of the document
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (UPP(2) == 'D') && (UPP(3) == 'O') &&
! (UPP(4) == 'C') && (UPP(5) == 'T') &&
! (UPP(6) == 'Y') && (UPP(7) == 'P') &&
! (UPP(8) == 'E')) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Misplaced DOCTYPE declaration\n");
! ctxt->wellFormed = 0;
! htmlParseDocTypeDecl(ctxt);
! }
! /*
! * First case : a comment
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (NXT(2) == '-') && (NXT(3) == '-')) {
! htmlParseComment(ctxt);
! }
! /*
! * Second case : a sub-element.
! */
! else if (CUR == '<') {
! htmlParseElement(ctxt);
! }
! /*
! * Third case : a reference. If if has not been resolved,
! * parsing returns it's Name, create the node
! */
! else if (CUR == '&') {
! htmlParseReference(ctxt);
! }
! /*
! * Fourth : end of the resource
! */
! else if (CUR == 0) {
! htmlAutoClose(ctxt, NULL);
! }
! /*
! * Last case, text. Note that References are handled directly.
! */
! else {
! htmlParseCharData(ctxt, 0);
! }
! if (cons == ctxt->nbChars) {
! if (ctxt->node != NULL) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "detected an error in element content\n");
! ctxt->wellFormed = 0;
}
- break;
}
-
GROW;
}
if (currentNode != NULL) xmlFree(currentNode);
--- 3139,3212 ----
return;
}
! if (xmlStrEqual(currentNode, "script")) {
! /*
! * Handle SCRIPT separately
! */
! htmlParseScript(ctxt);
! } else {
! /*
! * Sometimes DOCTYPE arrives in the middle of the document
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (UPP(2) == 'D') && (UPP(3) == 'O') &&
! (UPP(4) == 'C') && (UPP(5) == 'T') &&
! (UPP(6) == 'Y') && (UPP(7) == 'P') &&
! (UPP(8) == 'E')) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Misplaced DOCTYPE declaration\n");
! ctxt->wellFormed = 0;
! htmlParseDocTypeDecl(ctxt);
! }
! /*
! * First case : a comment
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (NXT(2) == '-') && (NXT(3) == '-')) {
! htmlParseComment(ctxt);
! }
! /*
! * Second case : a sub-element.
! */
! else if (CUR == '<') {
! htmlParseElement(ctxt);
! }
! /*
! * Third case : a reference. If if has not been resolved,
! * parsing returns it's Name, create the node
! */
! else if (CUR == '&') {
! htmlParseReference(ctxt);
! }
! /*
! * Fourth : end of the resource
! */
! else if (CUR == 0) {
! htmlAutoClose(ctxt, NULL);
! }
! /*
! * Last case, text. Note that References are handled directly.
! */
! else {
! htmlParseCharData(ctxt, 0);
! }
! if (cons == ctxt->nbChars) {
! if (ctxt->node != NULL) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "detected an error in element content\n");
! ctxt->wellFormed = 0;
! }
! break;
}
}
GROW;
}
if (currentNode != NULL) xmlFree(currentNode);
*** ../libxml2-2.2.5/HTMLtree.c Wed Oct 11 01:12:56 2000
--- HTMLtree.c Sat Oct 14 19:04:13 2000
***************
*** 818,823 ****
--- 818,833 ----
xmlOutputBufferWriteString(buf, ";");
return;
}
+ if (cur->type == HTML_PRESERVE_NODE) {
+ if (cur->content != NULL) {
+ #ifndef XML_USE_BUFFER_CONTENT
+ xmlOutputBufferWriteString(buf, (const char *)cur->content);
+ #else
+ xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
+ #endif
+ }
+ return;
+ }
/*
* Get specific HTmL info for taht node.
*** ../libxml2-2.2.5/include/libxml/HTMLtree.h Wed Oct 11 01:12:56 2000
--- include/libxml/HTMLtree.h Sat Oct 14 18:51:19 2000
***************
*** 22,27 ****
--- 22,28 ----
#define HTML_TEXT_NODE XML_TEXT_NODE
#define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE
#define HTML_COMMENT_NODE XML_COMMENT_NODE
+ #define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE
htmlDocPtr htmlNewDoc (const xmlChar *URI,
const xmlChar *ExternalID);
---- Message from the list xml@rpmfind.net Archived at : http://xmlsoft.org/messages/ to unsubscribe: echo "unsubscribe xml" | mail majordomo@rpmfind.net
This archive was generated by hypermail 2b29 : Sat Oct 14 2000 - 13:43:19 EDT