[xml] Minor parser optimizations

Date view Thread view Subject view Author view

From: Bjorn Reese (breese@mail1.stofanet.dk)
Date: Mon Nov 27 2000 - 10:57:50 EST


I have attached a patch with minor code optimizations for the parser,
that I did while looking at the parsing process with Quantify (I
started with Purify, but it didn't report any errors, so I decided
to look at performance instead). I only looked at the XML parser part.

The changes to parser.c really amount to a single change -- the rest is
indentation.

The changes to parserInternals.c are all done to handle the normal
flow for ASCII characters better (which is what most XML documents
mainly consist of). I tried various combinations and the ones in
the attachment performed best. Even the strange modification in
xmlIsIdeographic improves performance (no ASCII character matches
any of the given ranges, so every range would be tested before
returning 0; the added check returns early for such characters).

The patch is against libxml2-2.2.10.

diff -c libxml2-2.2.10/parser.c libxml2-2.2.10-breese/parser.c
*** libxml2-2.2.10/parser.c Sat Nov 25 11:39:37 2000
--- libxml2-2.2.10-breese/parser.c Mon Nov 27 16:42:27 2000
***************
*** 2092,2166 ****
                  buf[len++] = *current++;
              }
              ctxt->token = 0;
! } else if ((c == '&') && (NXT(1) == '#')) {
! int val = xmlParseCharRef(ctxt);
! if (val == '&') {
! /*
! * The reparsing will be done in xmlStringGetNodeList()
! * called by the attribute() function in SAX.c
! */
! static xmlChar buffer[6] = "&#38;";
  
! if (len > buf_size - 10) {
! growBuffer(buf);
! }
! current = &buffer[0];
! while (*current != 0) { /* non input consuming */
! buf[len++] = *current++;
                  }
              } else {
! len += xmlCopyChar(0, &buf[len], val);
! }
! } else if (c == '&') {
! ent = xmlParseEntityRef(ctxt);
! if ((ent != NULL) &&
! (ctxt->replaceEntities != 0)) {
! xmlChar *rep;
  
! if (ent->etype != XML_INTERNAL_PREDEFINED_ENTITY) {
! rep = xmlStringDecodeEntities(ctxt, ent->content,
! XML_SUBSTITUTE_REF, 0, 0, 0);
! if (rep != NULL) {
! current = rep;
! while (*current != 0) { /* non input consuming */
! buf[len++] = *current++;
! if (len > buf_size - 10) {
! growBuffer(buf);
                              }
                          }
! xmlFree(rep);
                      }
! } else {
! if (ent->content != NULL)
! buf[len++] = ent->content[0];
! }
! } else if (ent != NULL) {
! int i = xmlStrlen(ent->name);
! const xmlChar *cur = ent->name;
  
! /*
! * This may look absurd but is needed to detect
! * entities problems
! */
! if ((ent->etype != XML_INTERNAL_PREDEFINED_ENTITY) &&
! (ent->content != NULL)) {
! xmlChar *rep;
! rep = xmlStringDecodeEntities(ctxt, ent->content,
! XML_SUBSTITUTE_REF, 0, 0, 0);
! if (rep != NULL)
! xmlFree(rep);
! }
  
! /*
! * Just output the reference
! */
! buf[len++] = '&';
! if (len > buf_size - i - 10) {
! growBuffer(buf);
                  }
- for (;i > 0;i--)
- buf[len++] = *cur++;
- buf[len++] = ';';
              }
          } else {
              if ((c == 0x20) || (c == 0xD) || (c == 0xA) || (c == 0x9)) {
--- 2092,2168 ----
                  buf[len++] = *current++;
              }
              ctxt->token = 0;
! } else if (c == '&') {
! if (NXT(1) == '#') {
! int val = xmlParseCharRef(ctxt);
! if (val == '&') {
! /*
! * The reparsing will be done in xmlStringGetNodeList()
! * called by the attribute() function in SAX.c
! */
! static xmlChar buffer[6] = "&#38;";
  
! if (len > buf_size - 10) {
! growBuffer(buf);
! }
! current = &buffer[0];
! while (*current != 0) { /* non input consuming */
! buf[len++] = *current++;
! }
! } else {
! len += xmlCopyChar(0, &buf[len], val);
                  }
              } else {
! ent = xmlParseEntityRef(ctxt);
! if ((ent != NULL) &&
! (ctxt->replaceEntities != 0)) {
! xmlChar *rep;
  
! if (ent->etype != XML_INTERNAL_PREDEFINED_ENTITY) {
! rep = xmlStringDecodeEntities(ctxt, ent->content,
! XML_SUBSTITUTE_REF, 0, 0, 0);
! if (rep != NULL) {
! current = rep;
! while (*current != 0) { /* non input consuming */
! buf[len++] = *current++;
! if (len > buf_size - 10) {
! growBuffer(buf);
! }
                              }
+ xmlFree(rep);
                          }
! } else {
! if (ent->content != NULL)
! buf[len++] = ent->content[0];
                      }
! } else if (ent != NULL) {
! int i = xmlStrlen(ent->name);
! const xmlChar *cur = ent->name;
  
! /*
! * This may look absurd but is needed to detect
! * entities problems
! */
! if ((ent->etype != XML_INTERNAL_PREDEFINED_ENTITY) &&
! (ent->content != NULL)) {
! xmlChar *rep;
! rep = xmlStringDecodeEntities(ctxt, ent->content,
! XML_SUBSTITUTE_REF, 0, 0, 0);
! if (rep != NULL)
! xmlFree(rep);
! }
  
! /*
! * Just output the reference
! */
! buf[len++] = '&';
! if (len > buf_size - i - 10) {
! growBuffer(buf);
! }
! for (;i > 0;i--)
! buf[len++] = *cur++;
! buf[len++] = ';';
                  }
              }
          } else {
              if ((c == 0x20) || (c == 0xD) || (c == 0xA) || (c == 0x9)) {
diff -c libxml2-2.2.10/parserInternals.c libxml2-2.2.10-breese/parserInternals.c
*** libxml2-2.2.10/parserInternals.c Mon Nov 13 12:44:00 2000
--- libxml2-2.2.10-breese/parserInternals.c Mon Nov 27 16:35:57 2000
***************
*** 428,442 ****
   *
   * Returns 0 if not, non-zero otherwise
   */
  int
  xmlIsBaseChar(int c) {
      return(
! (((c) >= 0x0041) && ((c) <= 0x005A)) ||
! (((c) >= 0x0061) && ((c) <= 0x007A)) ||
! (((c) >= 0x00C0) && ((c) <= 0x00D6)) ||
! (((c) >= 0x00D8) && ((c) <= 0x00F6)) ||
! (((c) >= 0x00F8) && ((c) <= 0x00FF)) ||
! (((c) >= 0x100) && ( /* accelerator */
        (((c) >= 0x0100) && ((c) <= 0x0131)) ||
        (((c) >= 0x0134) && ((c) <= 0x013E)) ||
        (((c) >= 0x0141) && ((c) <= 0x0148)) ||
--- 428,457 ----
   *
   * Returns 0 if not, non-zero otherwise
   */
+ static int xmlBaseArray[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0000 - 0x000F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0010 - 0x001F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0020 - 0x002F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0030 - 0x003F */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x0040 - 0x004F */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x0050 - 0x005F */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x0060 - 0x006F */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x0070 - 0x007F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0080 - 0x008F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x0090 - 0x009F */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00A0 - 0x00AF */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00B0 - 0x00BF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00C0 - 0x00CF */
+ 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00D0 - 0x00DF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00E0 - 0x00EF */
+ 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00F0 - 0x00FF */
+ };
+
  int
  xmlIsBaseChar(int c) {
      return(
! (((c) < 0x0100) ? xmlBaseArray[c] :
! ( /* accelerator */
        (((c) >= 0x0100) && ((c) <= 0x0131)) ||
        (((c) >= 0x0134) && ((c) <= 0x013E)) ||
        (((c) >= 0x0141) && ((c) <= 0x0148)) ||
***************
*** 794,806 ****
   */
  int
  xmlIsExtender(int c) {
! return(
! ((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) ||
! ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) ||
! ((c) == 0xec6) || ((c) == 0x3005) ||
! (((c) >= 0x3031) && ((c) <= 0x3035)) ||
! (((c) >= 0x309b) && ((c) <= 0x309e)) ||
! (((c) >= 0x30fc) && ((c) <= 0x30fe)));
  }
  
  /**
--- 809,824 ----
   */
  int
  xmlIsExtender(int c) {
! switch (c) {
! case 0x00B7: case 0x02D0: case 0x02D1: case 0x0387:
! case 0x0640: case 0x0E46: case 0x0EC6: case 0x3005:
! case 0x3031: case 0x3032: case 0x3033: case 0x3034:
! case 0x3035: case 0x309D: case 0x309E: case 0x30FC:
! case 0x30FE:
! return 1;
! default:
! return 0;
! }
  }
  
  /**
***************
*** 814,820 ****
   */
  int
  xmlIsIdeographic(int c) {
! return(
       (((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||
       (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||
       (((c) >= 0x3021) && ((c) <= 0x3029)) ||
--- 832,838 ----
   */
  int
  xmlIsIdeographic(int c) {
! return(((c) < 0x0100) ? 0 :
       (((c) >= 0x4e00) && ((c) <= 0x9fa5)) ||
       (((c) >= 0xf900) && ((c) <= 0xfa2d)) ||
       (((c) >= 0x3021) && ((c) <= 0x3029)) ||

----
Message from the list xml@rpmfind.net
Archived at : http://xmlsoft.org/messages/
to unsubscribe: echo "unsubscribe xml" | mail  majordomo@rpmfind.net


Date view Thread view Subject view Author view

This archive was generated by hypermail 2b29 : Mon Nov 27 2000 - 11:43:56 EST