[xml] Patch for HTML SCRIPT

Date view Thread view Subject view Author view

From: Bjorn Reese (breese@mail1.stofanet.dk)
Date: Sat Oct 14 2000 - 13:26:26 EDT


Attached is a patch for the HTML SCRIPT element, whose contents should
not be parsed or encoded. The regression tests break, but that is
because the expected results of the regression tests are wrong.

A similar problem exists for the HTML STYLE element (which should be
straightforward to add), and for attributes containing script code.
The latter could be solved by recognizing quotes for the attributes.
My patch does not include any of this.

I have also attached a simplistic .html file used for testing.

*** ../libxml2-2.2.5/HTMLparser.c Wed Oct 11 01:12:56 2000
--- HTMLparser.c Sat Oct 14 19:13:14 2000
***************
*** 2210,2215 ****
--- 2210,2242 ----
  }
  
  /**
+ * htmlParseScript:
+ * @ctxt: an HTML parser context
+ */
+ void htmlParseScript(htmlParserCtxtPtr ctxt) {
+ int i = 0;
+ const xmlChar *start = CUR_PTR;
+
+ while (IS_CHAR(CUR)) {
+ if ((CUR == '<') && (NXT(1) == '/') &&
+ (UPP(2) == 'S') && (UPP(3) == 'C') &&
+ (UPP(4) == 'R') && (UPP(5) == 'I') &&
+ (UPP(6) == 'P') && (UPP(7) == 'T') &&
+ (NXT(8) == '>'))
+ break; /* while */
+ NEXT;
+ }
+ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
+ if (ctxt->sax->cdataBlock!= NULL) {
+ /*
+ * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
+ */
+ ctxt->sax->cdataBlock(ctxt->userData, start, CUR_PTR - start);
+ }
+ }
+ }
+
+ /**
   * htmlParseCharData:
   * @ctxt: an HTML parser context
   * @cdata: int indicating whether we are within a CDATA section
***************
*** 3112,3179 ****
              return;
          }
  
! /*
! * Sometimes DOCTYPE arrives in the middle of the document
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (UPP(2) == 'D') && (UPP(3) == 'O') &&
! (UPP(4) == 'C') && (UPP(5) == 'T') &&
! (UPP(6) == 'Y') && (UPP(7) == 'P') &&
! (UPP(8) == 'E')) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Misplaced DOCTYPE declaration\n");
! ctxt->wellFormed = 0;
! htmlParseDocTypeDecl(ctxt);
! }
  
! /*
! * First case : a comment
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (NXT(2) == '-') && (NXT(3) == '-')) {
! htmlParseComment(ctxt);
! }
  
! /*
! * Second case : a sub-element.
! */
! else if (CUR == '<') {
! htmlParseElement(ctxt);
! }
  
! /*
! * Third case : a reference. If if has not been resolved,
! * parsing returns it's Name, create the node
! */
! else if (CUR == '&') {
! htmlParseReference(ctxt);
! }
  
! /*
! * Fourth : end of the resource
! */
! else if (CUR == 0) {
! htmlAutoClose(ctxt, NULL);
! }
  
! /*
! * Last case, text. Note that References are handled directly.
! */
! else {
! htmlParseCharData(ctxt, 0);
! }
  
! if (cons == ctxt->nbChars) {
! if (ctxt->node != NULL) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "detected an error in element content\n");
! ctxt->wellFormed = 0;
              }
- break;
          }
-
          GROW;
      }
      if (currentNode != NULL) xmlFree(currentNode);
--- 3139,3212 ----
              return;
          }
  
! if (xmlStrEqual(currentNode, "script")) {
! /*
! * Handle SCRIPT separately
! */
! htmlParseScript(ctxt);
! } else {
! /*
! * Sometimes DOCTYPE arrives in the middle of the document
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (UPP(2) == 'D') && (UPP(3) == 'O') &&
! (UPP(4) == 'C') && (UPP(5) == 'T') &&
! (UPP(6) == 'Y') && (UPP(7) == 'P') &&
! (UPP(8) == 'E')) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "Misplaced DOCTYPE declaration\n");
! ctxt->wellFormed = 0;
! htmlParseDocTypeDecl(ctxt);
! }
  
! /*
! * First case : a comment
! */
! if ((CUR == '<') && (NXT(1) == '!') &&
! (NXT(2) == '-') && (NXT(3) == '-')) {
! htmlParseComment(ctxt);
! }
  
! /*
! * Second case : a sub-element.
! */
! else if (CUR == '<') {
! htmlParseElement(ctxt);
! }
  
! /*
! * Third case : a reference. If if has not been resolved,
! * parsing returns it's Name, create the node
! */
! else if (CUR == '&') {
! htmlParseReference(ctxt);
! }
  
! /*
! * Fourth : end of the resource
! */
! else if (CUR == 0) {
! htmlAutoClose(ctxt, NULL);
! }
  
! /*
! * Last case, text. Note that References are handled directly.
! */
! else {
! htmlParseCharData(ctxt, 0);
! }
  
! if (cons == ctxt->nbChars) {
! if (ctxt->node != NULL) {
! if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
! ctxt->sax->error(ctxt->userData,
! "detected an error in element content\n");
! ctxt->wellFormed = 0;
! }
! break;
              }
          }
          GROW;
      }
      if (currentNode != NULL) xmlFree(currentNode);
*** ../libxml2-2.2.5/HTMLtree.c Wed Oct 11 01:12:56 2000
--- HTMLtree.c Sat Oct 14 19:04:13 2000
***************
*** 818,823 ****
--- 818,833 ----
          xmlOutputBufferWriteString(buf, ";");
          return;
      }
+ if (cur->type == HTML_PRESERVE_NODE) {
+ if (cur->content != NULL) {
+ #ifndef XML_USE_BUFFER_CONTENT
+ xmlOutputBufferWriteString(buf, (const char *)cur->content);
+ #else
+ xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content));
+ #endif
+ }
+ return;
+ }
  
      /*
       * Get specific HTmL info for taht node.
*** ../libxml2-2.2.5/include/libxml/HTMLtree.h Wed Oct 11 01:12:56 2000
--- include/libxml/HTMLtree.h Sat Oct 14 18:51:19 2000
***************
*** 22,27 ****
--- 22,28 ----
  #define HTML_TEXT_NODE XML_TEXT_NODE
  #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE
  #define HTML_COMMENT_NODE XML_COMMENT_NODE
+ #define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE
  
  htmlDocPtr htmlNewDoc (const xmlChar *URI,
                                           const xmlChar *ExternalID);

Script tests

----
Message from the list xml@rpmfind.net
Archived at : http://xmlsoft.org/messages/
to unsubscribe: echo "unsubscribe xml" | mail  majordomo@rpmfind.net


Date view Thread view Subject view Author view

This archive was generated by hypermail 2b29 : Sat Oct 14 2000 - 13:43:19 EDT