Jabber WindowGram Client (JWGC)

Introduction Screenshots Installation Downloads
Documentation Browse Source Resources Project Site

Stable Version
-none-

Latest Version
beta5



Main Page | Alphabetical List | Data Structures | Directories | File List | Data Fields | Globals

xmltok_impl.c

Go to the documentation of this file.
00001 /*
00002 Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
00003 See the file COPYING for copying permission.
00004 */
00005 /* Default: IS_INVALID_CHAR expands to 0, i.e. no multi-byte sequence is
         rejected by it, unless it was already defined before this point. */
00006 #ifndef IS_INVALID_CHAR
00007 #define IS_INVALID_CHAR(enc, ptr, n) (0)
00008 #endif
00009 /* INVALID_LEAD_CASE(n, ptr, nextTokPtr): switch arm for an n-byte lead byte.
         Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain, returns
         XML_TOK_INVALID (with *nextTokPtr = ptr) when IS_INVALID_CHAR says so,
         otherwise advances ptr by n and breaks out of the switch.
         Note: `enc` and `end` are not macro parameters; they are picked up
         from the scope in which the macro is expanded. */
00010 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
00011     case BT_LEAD ## n: \
00012       if (end - ptr < n) \
00013         return XML_TOK_PARTIAL_CHAR; \
00014       if (IS_INVALID_CHAR(enc, ptr, n)) { \
00015         *(nextTokPtr) = (ptr); \
00016         return XML_TOK_INVALID; \
00017       } \
00018       ptr += n; \
00019       break;
00020 /* INVALID_CASES: the 2-, 3- and 4-byte lead arms above, plus arms that
         reject BT_NONXML, BT_MALFORM and BT_TRAIL bytes outright with
         XML_TOK_INVALID and *nextTokPtr = ptr. */
00021 #define INVALID_CASES(ptr, nextTokPtr) \
00022   INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
00023   INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
00024   INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
00025   case BT_NONXML: \
00026   case BT_MALFORM: \
00027   case BT_TRAIL: \
00028     *(nextTokPtr) = (ptr); \
00029     return XML_TOK_INVALID;
00030 /* CHECK_NAME_CASE(n, ...): switch arm for an n-byte character inside a name.
         Truncated input gives XML_TOK_PARTIAL_CHAR; a character failing
         IS_NAME_CHAR gives XML_TOK_INVALID; otherwise ptr advances by n. */
00031 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
00032    case BT_LEAD ## n: \
00033      if (end - ptr < n) \
00034        return XML_TOK_PARTIAL_CHAR; \
00035      if (!IS_NAME_CHAR(enc, ptr, n)) { \
00036        *nextTokPtr = ptr; \
00037        return XML_TOK_INVALID; \
00038      } \
00039      ptr += n; \
00040      break;
00041 /* CHECK_NAME_CASES: all switch arms that accept one name character.
         BT_NONASCII is validated with IS_NAME_CHAR_MINBPC and then
         deliberately falls through to the shared "advance by MINBPC" arm
         used by BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS. */
00042 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
00043   case BT_NONASCII: \
00044     if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
00045       *nextTokPtr = ptr; \
00046       return XML_TOK_INVALID; \
00047     } \
00048   case BT_NMSTRT: \
00049   case BT_HEX: \
00050   case BT_DIGIT: \
00051   case BT_NAME: \
00052   case BT_MINUS: \
00053     ptr += MINBPC(enc); \
00054     break; \
00055   CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
00056   CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
00057   CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
00058 /* CHECK_NMSTRT_CASE(n, ...): same shape as CHECK_NAME_CASE, but the
         character must satisfy IS_NMSTRT_CHAR (name-start character). */
00059 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
00060    case BT_LEAD ## n: \
00061      if (end - ptr < n) \
00062        return XML_TOK_PARTIAL_CHAR; \
00063      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
00064        *nextTokPtr = ptr; \
00065        return XML_TOK_INVALID; \
00066      } \
00067      ptr += n; \
00068      break;
00069 /* CHECK_NMSTRT_CASES: all switch arms that accept one name-start character.
         BT_NONASCII is validated with IS_NMSTRT_CHAR_MINBPC and then falls
         through to the "advance by MINBPC" arm shared with BT_NMSTRT and
         BT_HEX. */
00070 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
00071   case BT_NONASCII: \
00072     if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
00073       *nextTokPtr = ptr; \
00074       return XML_TOK_INVALID; \
00075     } \
00076   case BT_NMSTRT: \
00077   case BT_HEX: \
00078     ptr += MINBPC(enc); \
00079     break; \
00080   CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
00081   CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
00082   CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
00083 /* PREFIX(ident) decorates every function name defined below; by default
         it leaves the identifier unchanged. */
00084 #ifndef PREFIX
00085 #define PREFIX(ident) ident
00086 #endif
00087 
00088 /* ptr points to character following "<!-"
         Scans the remainder of a comment.
         Returns XML_TOK_COMMENT with *nextTokPtr just past the closing "-->".
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character
         when the second '-' of the opener is missing, when an invalid
         character is met (INVALID_CASES), or when "--" inside the comment is
         not immediately followed by '>'.
         Returns XML_TOK_PARTIAL when the input ends first (or
         XML_TOK_PARTIAL_CHAR from INVALID_CASES for a truncated multi-byte
         character). */
00089 
00090 static
00091 int PREFIX(scanComment)(const ENCODING *enc, const char *ptr, const char *end,
00092                         const char **nextTokPtr)
00093 {
00094   if (ptr != end) {
00095     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) { /* opener must be "<!--" */
00096       *nextTokPtr = ptr;
00097       return XML_TOK_INVALID;
00098     }
00099     ptr += MINBPC(enc);
00100     while (ptr != end) {
00101       switch (BYTE_TYPE(enc, ptr)) {
00102       INVALID_CASES(ptr, nextTokPtr)
00103       case BT_MINUS:
00104         if ((ptr += MINBPC(enc)) == end)
00105           return XML_TOK_PARTIAL;
00106         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
00107           if ((ptr += MINBPC(enc)) == end)
00108             return XML_TOK_PARTIAL;
00109           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) { /* "--" must be followed by '>' */
00110             *nextTokPtr = ptr;
00111             return XML_TOK_INVALID;
00112           }
00113           *nextTokPtr = ptr + MINBPC(enc);
00114           return XML_TOK_COMMENT;
00115         }
00116         break; /* single '-': the following character is re-examined by the loop */
00117       default:
00118         ptr += MINBPC(enc);
00119         break;
00120       }
00121     }
00122   }
00123   return XML_TOK_PARTIAL;
00124 }
00125 
00126 /* ptr points to character following "<!"
         Classifies what follows:
           '-'  -> hands off to scanComment;
           '['  -> XML_TOK_COND_SECT_OPEN, *nextTokPtr just past the '[';
           a BT_NMSTRT/BT_HEX character -> scans the declaration keyword and
                   returns XML_TOK_DECL_OPEN with *nextTokPtr at the
                   whitespace or '%' that terminates it (not consumed).
         Anything else yields XML_TOK_INVALID with *nextTokPtr at the
         offending character; running out of input yields XML_TOK_PARTIAL. */
00127 
00128 static
00129 int PREFIX(scanDecl)(const ENCODING *enc, const char *ptr, const char *end,
00130                      const char **nextTokPtr)
00131 {
00132   if (ptr == end)
00133     return XML_TOK_PARTIAL;
00134   switch (BYTE_TYPE(enc, ptr)) {
00135   case BT_MINUS:
00136     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00137   case BT_LSQB:
00138     *nextTokPtr = ptr + MINBPC(enc);
00139     return XML_TOK_COND_SECT_OPEN;
00140   case BT_NMSTRT:
00141   case BT_HEX:
00142     ptr += MINBPC(enc);
00143     break;
00144   default:
00145     *nextTokPtr = ptr;
00146     return XML_TOK_INVALID;
00147   }
00148   while (ptr != end) {
00149     switch (BYTE_TYPE(enc, ptr)) {
00150     case BT_PERCNT:
00151       if (ptr + MINBPC(enc) == end) /* need one character of lookahead */
00152         return XML_TOK_PARTIAL;
00153       /* don't allow <!ENTITY% foo "whatever"> */
00154       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
00155       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
00156         *nextTokPtr = ptr;
00157         return XML_TOK_INVALID;
00158       }
00159       /* fall through */
00160     case BT_S: case BT_CR: case BT_LF:
00161       *nextTokPtr = ptr;
00162       return XML_TOK_DECL_OPEN;
00163     case BT_NMSTRT:
00164     case BT_HEX:
00165       ptr += MINBPC(enc);
00166       break;
00167     default:
00168       *nextTokPtr = ptr;
00169       return XML_TOK_INVALID;
00170     }
00171   }
00172   return XML_TOK_PARTIAL;
00173 }
00174 /* Inspects the processing-instruction target in [ptr, end).
         *tokPtr is first set to XML_TOK_PI.  If the target is exactly three
         characters long and spells "xml" in all lower case, *tokPtr becomes
         XML_TOK_XML_DECL instead.
         Returns 1 when the target is acceptable and 0 when it spells "xml"
         with at least one upper-case letter (callers in this file turn a 0
         result into XML_TOK_INVALID). */
00175 static
00176 int PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr, const char *end, int *tokPtr)
00177 {
00178   int upper = 0; /* set once any of the three letters is upper case */
00179   *tokPtr = XML_TOK_PI;
00180   if (end - ptr != MINBPC(enc)*3) /* not three characters: cannot be "xml" */
00181     return 1;
00182   switch (BYTE_TO_ASCII(enc, ptr)) {
00183   case ASCII_x:
00184     break;
00185   case ASCII_X:
00186     upper = 1;
00187     break;
00188   default:
00189     return 1;
00190   }
00191   ptr += MINBPC(enc);
00192   switch (BYTE_TO_ASCII(enc, ptr)) {
00193   case ASCII_m:
00194     break;
00195   case ASCII_M:
00196     upper = 1;
00197     break;
00198   default:
00199     return 1;
00200   }
00201   ptr += MINBPC(enc);
00202   switch (BYTE_TO_ASCII(enc, ptr)) {
00203   case ASCII_l:
00204     break;
00205   case ASCII_L:
00206     upper = 1;
00207     break;
00208   default:
00209     return 1;
00210   }
00211   if (upper) /* "XML", "Xml", ... : rejected */
00212     return 0;
00213   *tokPtr = XML_TOK_XML_DECL;
00214   return 1;
00215 }
00216 
00217 /* ptr points to character following "<?"
         Scans a processing instruction (or XML declaration).
         The target name is scanned first; on the whitespace or '?' that ends
         it, checkPiTarget decides whether the token is XML_TOK_PI or
         XML_TOK_XML_DECL.  On success that token is returned with
         *nextTokPtr just past the closing "?>".
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character
         for a bad target, a rejected target spelling, an invalid character
         in the body, or a '?' directly after the target that is not
         followed by '>'.  Returns XML_TOK_PARTIAL when input runs out. */
00218 
00219 static
00220 int PREFIX(scanPi)(const ENCODING *enc, const char *ptr, const char *end,
00221                    const char **nextTokPtr)
00222 {
00223   int tok;
00224   const char *target = ptr; /* start of the target name, for checkPiTarget */
00225   if (ptr == end)
00226     return XML_TOK_PARTIAL;
00227   switch (BYTE_TYPE(enc, ptr)) {
00228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00229   default:
00230     *nextTokPtr = ptr;
00231     return XML_TOK_INVALID;
00232   }
00233   while (ptr != end) {
00234     switch (BYTE_TYPE(enc, ptr)) {
00235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00236     case BT_S: case BT_CR: case BT_LF:
00237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00238         *nextTokPtr = ptr;
00239         return XML_TOK_INVALID;
00240       }
00241       ptr += MINBPC(enc);
00242       while (ptr != end) { /* body: everything up to the first "?>" */
00243         switch (BYTE_TYPE(enc, ptr)) {
00244         INVALID_CASES(ptr, nextTokPtr)
00245         case BT_QUEST:
00246           ptr += MINBPC(enc);
00247           if (ptr == end)
00248             return XML_TOK_PARTIAL;
00249           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00250             *nextTokPtr = ptr + MINBPC(enc);
00251             return tok;
00252           }
00253           break;
00254         default:
00255           ptr += MINBPC(enc);
00256           break;
00257         }
00258       }
00259       return XML_TOK_PARTIAL;
00260     case BT_QUEST: /* target directly followed by '?': only "?>" is allowed */
00261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
00262         *nextTokPtr = ptr;
00263         return XML_TOK_INVALID;
00264       }
00265       ptr += MINBPC(enc);
00266       if (ptr == end)
00267         return XML_TOK_PARTIAL;
00268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00269         *nextTokPtr = ptr + MINBPC(enc);
00270         return tok;
00271       }
00272       /* fall through */
00273     default:
00274       *nextTokPtr = ptr;
00275       return XML_TOK_INVALID;
00276     }
00277   }
00278   return XML_TOK_PARTIAL;
00279 }
00280 
00281 /* Matches the six characters "CDATA[" starting at ptr (scanLt calls this
         with ptr just past "<![").
         Returns XML_TOK_CDATA_SECT_OPEN with *nextTokPtr just past the '[';
         XML_TOK_INVALID with *nextTokPtr at the first mismatching character;
         XML_TOK_PARTIAL when fewer than six characters are available. */
00282 static
00283 int PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr, const char *end,
00284                              const char **nextTokPtr)
00285 {
00286   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A, ASCII_T, ASCII_A, ASCII_LSQB };
00287   int i;
00288   /* CDATA[ */
00289   if (end - ptr < 6 * MINBPC(enc))
00290     return XML_TOK_PARTIAL;
00291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
00292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
00293       *nextTokPtr = ptr;
00294       return XML_TOK_INVALID;
00295     }
00296   }
00297   *nextTokPtr = ptr;
00298   return XML_TOK_CDATA_SECT_OPEN;
00299 }
00300 /* Returns the next token from [ptr, end), tokenizing CDATA-section content.
         XML_TOK_NONE             - no input (ptr == end);
         XML_TOK_CDATA_SECT_CLOSE - "]]>" at ptr, *nextTokPtr just past it;
         XML_TOK_DATA_NEWLINE     - CR, LF or CR LF at ptr;
         XML_TOK_DATA_CHARS       - a run of ordinary characters, with
                                    *nextTokPtr at the first character that
                                    ends the run (or at end);
         XML_TOK_INVALID          - invalid first character (INVALID_CASES);
         XML_TOK_PARTIAL          - more input is needed to decide. */
00301 static
00302 int PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
00303                             const char **nextTokPtr)
00304 {
00305   if (ptr == end)
00306     return XML_TOK_NONE;
00307   if (MINBPC(enc) > 1) {
            /* Trim end so that only whole MINBPC-sized units are examined.
               NOTE(review): the mask arithmetic presumes MINBPC(enc) is a
               power of two - confirm against the encodings in use. */
00308     size_t n = end - ptr;
00309     if (n & (MINBPC(enc) - 1)) {
00310       n &= ~(MINBPC(enc) - 1);
00311       if (n == 0)
00312         return XML_TOK_PARTIAL;
00313       end = ptr + n;
00314     }
00315   }
00316   switch (BYTE_TYPE(enc, ptr)) {
00317   case BT_RSQB:
00318     ptr += MINBPC(enc);
00319     if (ptr == end)
00320       return XML_TOK_PARTIAL;
00321     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
00322       break; /* lone ']' is data; continue with the run scan below */
00323     ptr += MINBPC(enc);
00324     if (ptr == end)
00325       return XML_TOK_PARTIAL;
00326     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00327       ptr -= MINBPC(enc); /* back up so the second ']' is examined by the loop */
00328       break;
00329     }
00330     *nextTokPtr = ptr + MINBPC(enc);
00331     return XML_TOK_CDATA_SECT_CLOSE;
00332   case BT_CR:
00333     ptr += MINBPC(enc);
00334     if (ptr == end)
00335       return XML_TOK_PARTIAL;
00336     if (BYTE_TYPE(enc, ptr) == BT_LF) /* CR LF is reported as one newline */
00337       ptr += MINBPC(enc);
00338     *nextTokPtr = ptr;
00339     return XML_TOK_DATA_NEWLINE;
00340   case BT_LF:
00341     *nextTokPtr = ptr + MINBPC(enc);
00342     return XML_TOK_DATA_NEWLINE;
00343   INVALID_CASES(ptr, nextTokPtr)
00344   default:
00345     ptr += MINBPC(enc);
00346     break;
00347   }
          /* Extend the data run; inside the run, problem characters end the
             run instead of producing an error. */
00348   while (ptr != end) {
00349     switch (BYTE_TYPE(enc, ptr)) {
00350 #define LEAD_CASE(n) \
00351     case BT_LEAD ## n: \
00352       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00353         *nextTokPtr = ptr; \
00354         return XML_TOK_DATA_CHARS; \
00355       } \
00356       ptr += n; \
00357       break;
00358     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00359 #undef LEAD_CASE
00360     case BT_NONXML:
00361     case BT_MALFORM:
00362     case BT_TRAIL:
00363     case BT_CR:
00364     case BT_LF:
00365     case BT_RSQB:
00366       *nextTokPtr = ptr;
00367       return XML_TOK_DATA_CHARS;
00368     default:
00369       ptr += MINBPC(enc);
00370       break;
00371     }
00372   }
00373   *nextTokPtr = ptr;
00374   return XML_TOK_DATA_CHARS;
00375 }
00376 
00377 /* ptr points to character following "</"
         Scans an end tag: a name, optional trailing whitespace, then '>'.
         Returns XML_TOK_END_TAG with *nextTokPtr just past the '>';
         XML_TOK_INVALID with *nextTokPtr at the offending character;
         XML_TOK_PARTIAL when the input ends before the '>'. */
00378 
00379 static
00380 int PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr, const char *end,
00381                        const char **nextTokPtr)
00382 {
00383   if (ptr == end)
00384     return XML_TOK_PARTIAL;
00385   switch (BYTE_TYPE(enc, ptr)) {
00386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00387   default:
00388     *nextTokPtr = ptr;
00389     return XML_TOK_INVALID;
00390   }
00391   while (ptr != end) {
00392     switch (BYTE_TYPE(enc, ptr)) {
00393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00394     case BT_S: case BT_CR: case BT_LF:
            /* after the name only whitespace and the closing '>' may follow */
00395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00396         switch (BYTE_TYPE(enc, ptr)) {
00397         case BT_S: case BT_CR: case BT_LF:
00398           break;
00399         case BT_GT:
00400           *nextTokPtr = ptr + MINBPC(enc);
00401           return XML_TOK_END_TAG;
00402         default:
00403           *nextTokPtr = ptr;
00404           return XML_TOK_INVALID;
00405         }
00406       }
00407       return XML_TOK_PARTIAL;
00408 #ifdef XML_NS
00409     case BT_COLON:
00410       /* no need to check qname syntax here, since end-tag must match exactly */
00411       ptr += MINBPC(enc);
00412       break;
00413 #endif
00414     case BT_GT:
00415       *nextTokPtr = ptr + MINBPC(enc);
00416       return XML_TOK_END_TAG;
00417     default:
00418       *nextTokPtr = ptr;
00419       return XML_TOK_INVALID;
00420     }
00421   }
00422   return XML_TOK_PARTIAL;
00423 }
00424 
00425 /* ptr points to character following "&#x"
         (the caller, scanCharRef, matches a lower-case 'x' only).
         Requires at least one BT_DIGIT/BT_HEX character, then a ';'.
         Returns XML_TOK_CHAR_REF with *nextTokPtr just past the ';';
         XML_TOK_INVALID with *nextTokPtr at the offending character;
         XML_TOK_PARTIAL when the input ends before the ';'. */
00426 
00427 static
00428 int PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr, const char *end,
00429                            const char **nextTokPtr)
00430 {
00431   if (ptr != end) {
00432     switch (BYTE_TYPE(enc, ptr)) { /* first character must be a hex digit */
00433     case BT_DIGIT:
00434     case BT_HEX:
00435       break;
00436     default:
00437       *nextTokPtr = ptr;
00438       return XML_TOK_INVALID;
00439     }
00440     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00441       switch (BYTE_TYPE(enc, ptr)) {
00442       case BT_DIGIT:
00443       case BT_HEX:
00444         break;
00445       case BT_SEMI:
00446         *nextTokPtr = ptr + MINBPC(enc);
00447         return XML_TOK_CHAR_REF;
00448       default:
00449         *nextTokPtr = ptr;
00450         return XML_TOK_INVALID;
00451       }
00452     }
00453   }
00454   return XML_TOK_PARTIAL;
00455 }
00456 
00457 /* ptr points to character following "&#"
         A lower-case 'x' here hands off to scanHexCharRef.  Otherwise at
         least one BT_DIGIT character followed by ';' is required.
         Returns XML_TOK_CHAR_REF with *nextTokPtr just past the ';';
         XML_TOK_INVALID with *nextTokPtr at the offending character;
         XML_TOK_PARTIAL when the input ends before the ';'. */
00458 
00459 static
00460 int PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr, const char *end,
00461                         const char **nextTokPtr)
00462 {
00463   if (ptr != end) {
00464     if (CHAR_MATCHES(enc, ptr, ASCII_x))
00465       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00466     switch (BYTE_TYPE(enc, ptr)) { /* first character must be a decimal digit */
00467     case BT_DIGIT:
00468       break;
00469     default:
00470       *nextTokPtr = ptr;
00471       return XML_TOK_INVALID;
00472     }
00473     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
00474       switch (BYTE_TYPE(enc, ptr)) {
00475       case BT_DIGIT:
00476         break;
00477       case BT_SEMI:
00478         *nextTokPtr = ptr + MINBPC(enc);
00479         return XML_TOK_CHAR_REF;
00480       default:
00481         *nextTokPtr = ptr;
00482         return XML_TOK_INVALID;
00483       }
00484     }
00485   }
00486   return XML_TOK_PARTIAL;
00487 }
00488 
00489 /* ptr points to character following "&"
         A '#' (BT_NUM) hands off to scanCharRef.  Otherwise a name followed
         by ';' is required.
         Returns XML_TOK_ENTITY_REF with *nextTokPtr just past the ';';
         XML_TOK_INVALID with *nextTokPtr at the offending character;
         XML_TOK_PARTIAL when the input ends before the ';'. */
00490 
00491 static
00492 int PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
00493                     const char **nextTokPtr)
00494 {
00495   if (ptr == end)
00496     return XML_TOK_PARTIAL;
00497   switch (BYTE_TYPE(enc, ptr)) {
00498   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00499   case BT_NUM:
00500     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00501   default:
00502     *nextTokPtr = ptr;
00503     return XML_TOK_INVALID;
00504   }
00505   while (ptr != end) { /* rest of the entity name */
00506     switch (BYTE_TYPE(enc, ptr)) {
00507     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00508     case BT_SEMI:
00509       *nextTokPtr = ptr + MINBPC(enc);
00510       return XML_TOK_ENTITY_REF;
00511     default:
00512       *nextTokPtr = ptr;
00513       return XML_TOK_INVALID;
00514     }
00515   }
00516   return XML_TOK_PARTIAL;
00517 }
00518 
00519 /* ptr points to character following first character of attribute name
         Scans the attribute list of a start tag through to the end of the tag.
         Each attribute is: name, optional whitespace, '=', optional
         whitespace, a quoted value.  Inside a value '<' is rejected and '&'
         is validated by scanRef.  After a value there must be whitespace,
         '/' or '>'.
         Returns XML_TOK_START_TAG_WITH_ATTS (tag ended with '>') or
         XML_TOK_EMPTY_ELEMENT_WITH_ATTS (tag ended with "/>"), with
         *nextTokPtr just past the '>'; XML_TOK_INVALID with *nextTokPtr at
         the offending character; XML_TOK_PARTIAL when input runs out.
         With XML_NS defined, at most one ':' is accepted per attribute name
         and it must be followed by a name-start character. */
00520 
00521 static
00522 int PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
00523                      const char **nextTokPtr)
00524 {
00525 #ifdef XML_NS
00526   int hadColon = 0;
00527 #endif
00528   while (ptr != end) {
00529     switch (BYTE_TYPE(enc, ptr)) {
00530     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00531 #ifdef XML_NS
00532     case BT_COLON:
00533       if (hadColon) {
00534         *nextTokPtr = ptr;
00535         return XML_TOK_INVALID;
00536       }
00537       hadColon = 1;
00538       ptr += MINBPC(enc);
00539       if (ptr == end)
00540         return XML_TOK_PARTIAL;
00541       switch (BYTE_TYPE(enc, ptr)) {
00542       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00543       default:
00544         *nextTokPtr = ptr;
00545         return XML_TOK_INVALID;
00546       }
00547       break;
00548 #endif
00549     case BT_S: case BT_CR: case BT_LF:
            /* whitespace after the name: skip it; only '=' may follow */
00550       for (;;) {
00551         int t;
00552 
00553         ptr += MINBPC(enc);
00554         if (ptr == end)
00555           return XML_TOK_PARTIAL;
00556         t = BYTE_TYPE(enc, ptr);
00557         if (t == BT_EQUALS)
00558           break;
00559         switch (t) {
00560         case BT_S:
00561         case BT_LF:
00562         case BT_CR:
00563           break;
00564         default:
00565           *nextTokPtr = ptr;
00566           return XML_TOK_INVALID;
00567         }
00568       }
00569     /* fall through */
00570     case BT_EQUALS:
00571       {
00572         int open; /* byte type of the opening quote (BT_QUOT or BT_APOS) */
00573 #ifdef XML_NS
00574         hadColon = 0;
00575 #endif
00576         for (;;) { /* skip whitespace between '=' and the opening quote */
00577           
00578           ptr += MINBPC(enc);
00579           if (ptr == end)
00580             return XML_TOK_PARTIAL;
00581           open = BYTE_TYPE(enc, ptr);
00582           if (open == BT_QUOT || open == BT_APOS)
00583             break;
00584           switch (open) {
00585           case BT_S:
00586           case BT_LF:
00587           case BT_CR:
00588             break;
00589           default:
00590             *nextTokPtr = ptr;
00591             return XML_TOK_INVALID;
00592           }
00593         }
00594         ptr += MINBPC(enc);
00595         /* in attribute value */
00596         for (;;) {
00597           int t;
00598           if (ptr == end)
00599             return XML_TOK_PARTIAL;
00600           t = BYTE_TYPE(enc, ptr);
00601           if (t == open) /* matching closing quote */
00602             break;
00603           switch (t) {
00604           INVALID_CASES(ptr, nextTokPtr)
00605           case BT_AMP:
00606             {
                    /* scanRef stores its next-token position directly into
                       ptr, so on success scanning resumes after the reference */
00607               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
00608               if (tok <= 0) {
00609                 if (tok == XML_TOK_INVALID)
00610                   *nextTokPtr = ptr;
00611                 return tok;
00612               }
00613               break;
00614             }
00615           case BT_LT:
00616             *nextTokPtr = ptr;
00617             return XML_TOK_INVALID;
00618           default:
00619             ptr += MINBPC(enc);
00620             break;
00621           }
00622         }
00623         ptr += MINBPC(enc); /* step past the closing quote */
00624         if (ptr == end)
00625           return XML_TOK_PARTIAL;
00626         switch (BYTE_TYPE(enc, ptr)) {
00627         case BT_S:
00628         case BT_CR:
00629         case BT_LF:
00630           break;
00631         case BT_SOL:
00632           goto sol;
00633         case BT_GT:
00634           goto gt;
00635         default:
00636           *nextTokPtr = ptr;
00637           return XML_TOK_INVALID;
00638         }
00639         /* ptr points to closing quote */
00640         for (;;) {
00641           ptr += MINBPC(enc);
00642           if (ptr == end)
00643             return XML_TOK_PARTIAL;
00644           switch (BYTE_TYPE(enc, ptr)) {
00645           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00646           case BT_S: case BT_CR: case BT_LF:
00647             continue;
00648           case BT_GT:
00649           gt:
00650             *nextTokPtr = ptr + MINBPC(enc);
00651             return XML_TOK_START_TAG_WITH_ATTS;
00652           case BT_SOL:
00653           sol:
00654             ptr += MINBPC(enc);
00655             if (ptr == end)
00656               return XML_TOK_PARTIAL;
00657             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00658               *nextTokPtr = ptr;
00659               return XML_TOK_INVALID;
00660             }
00661             *nextTokPtr = ptr + MINBPC(enc);
00662             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
00663           default:
00664             *nextTokPtr = ptr;
00665             return XML_TOK_INVALID;
00666           }
00667           break; /* reached after a name-start character: next attribute begins */
00668         }
00669         break; /* back to the outer loop to scan the rest of that name */
00670       }
00671     default:
00672       *nextTokPtr = ptr;
00673       return XML_TOK_INVALID;
00674     }
00675   }
00676   return XML_TOK_PARTIAL;
00677 }
00678 
00679 /* ptr points to character following "<"
         Dispatches on what follows the '<':
           "<!-"  -> scanComment;   "<!["  -> scanCdataSection;
           "<?"   -> scanPi;        "</"   -> scanEndTag;
           a name-start character  -> a start tag is scanned here.
         For a start tag without attributes returns
         XML_TOK_START_TAG_NO_ATTS ('>') or XML_TOK_EMPTY_ELEMENT_NO_ATTS
         ("/>") with *nextTokPtr just past the '>'.  If whitespace after the
         element name is followed by a name-start character, the rest of the
         tag is handed to scanAtts.
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character,
         or XML_TOK_PARTIAL when input runs out. */
00680 
00681 static
00682 int PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
00683                    const char **nextTokPtr)
00684 {
00685 #ifdef XML_NS
00686   int hadColon;
00687 #endif
00688   if (ptr == end)
00689     return XML_TOK_PARTIAL;
00690   switch (BYTE_TYPE(enc, ptr)) {
00691   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00692   case BT_EXCL:
00693     if ((ptr += MINBPC(enc)) == end)
00694       return XML_TOK_PARTIAL;
00695     switch (BYTE_TYPE(enc, ptr)) {
00696     case BT_MINUS:
00697       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00698     case BT_LSQB:
00699       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00700     }
00701     *nextTokPtr = ptr;
00702     return XML_TOK_INVALID;
00703   case BT_QUEST:
00704     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00705   case BT_SOL:
00706     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00707   default:
00708     *nextTokPtr = ptr;
00709     return XML_TOK_INVALID;
00710   }
00711 #ifdef XML_NS
00712   hadColon = 0;
00713 #endif
00714   /* we have a start-tag */
00715   while (ptr != end) {
00716     switch (BYTE_TYPE(enc, ptr)) {
00717     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00718 #ifdef XML_NS
00719     case BT_COLON:
00720       if (hadColon) { /* at most one ':' in the element name */
00721         *nextTokPtr = ptr;
00722         return XML_TOK_INVALID;
00723       }
00724       hadColon = 1;
00725       ptr += MINBPC(enc);
00726       if (ptr == end)
00727         return XML_TOK_PARTIAL;
00728       switch (BYTE_TYPE(enc, ptr)) {
00729       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00730       default:
00731         *nextTokPtr = ptr;
00732         return XML_TOK_INVALID;
00733       }
00734       break;
00735 #endif
00736     case BT_S: case BT_CR: case BT_LF:
00737       {
00738         ptr += MINBPC(enc);
00739         while (ptr != end) {
00740           switch (BYTE_TYPE(enc, ptr)) {
00741           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00742           case BT_GT:
00743             goto gt;
00744           case BT_SOL:
00745             goto sol;
00746           case BT_S: case BT_CR: case BT_LF:
00747             ptr += MINBPC(enc);
00748             continue;
00749           default:
00750             *nextTokPtr = ptr;
00751             return XML_TOK_INVALID;
00752           }
                /* only reached via the `break` in CHECK_NMSTRT_CASES: ptr has
                   already moved past the first character of an attribute name */
00753           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
00754         }
00755         return XML_TOK_PARTIAL;
00756       }
00757     case BT_GT:
00758     gt:
00759       *nextTokPtr = ptr + MINBPC(enc);
00760       return XML_TOK_START_TAG_NO_ATTS;
00761     case BT_SOL:
00762     sol:
00763       ptr += MINBPC(enc);
00764       if (ptr == end)
00765         return XML_TOK_PARTIAL;
00766       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00767         *nextTokPtr = ptr;
00768         return XML_TOK_INVALID;
00769       }
00770       *nextTokPtr = ptr + MINBPC(enc);
00771       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
00772     default:
00773       *nextTokPtr = ptr;
00774       return XML_TOK_INVALID;
00775     }
00776   }
00777   return XML_TOK_PARTIAL;
00778 }
00779 /* Returns the next token from [ptr, end), tokenizing element content.
         XML_TOK_NONE          - no input (ptr == end);
         '<' and '&'           - delegated to scanLt / scanRef;
         XML_TOK_DATA_NEWLINE  - CR, LF or CR LF at ptr;
         XML_TOK_TRAILING_CR   - a CR is the last character available;
         XML_TOK_TRAILING_RSQB - input ends after "]" or "]]" at ptr;
         XML_TOK_INVALID       - "]]>" in content (*nextTokPtr at the '>'),
                                 or an invalid first character;
         XML_TOK_DATA_CHARS    - a run of ordinary characters, with
                                 *nextTokPtr at the character that ends it;
         XML_TOK_PARTIAL       - less than one whole character is available. */
00780 static
00781 int PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
00782                        const char **nextTokPtr)
00783 {
00784   if (ptr == end)
00785     return XML_TOK_NONE;
00786   if (MINBPC(enc) > 1) {
            /* Trim end so that only whole MINBPC-sized units are examined.
               NOTE(review): the mask arithmetic presumes MINBPC(enc) is a
               power of two - confirm against the encodings in use. */
00787     size_t n = end - ptr;
00788     if (n & (MINBPC(enc) - 1)) {
00789       n &= ~(MINBPC(enc) - 1);
00790       if (n == 0)
00791         return XML_TOK_PARTIAL;
00792       end = ptr + n;
00793     }
00794   }
00795   switch (BYTE_TYPE(enc, ptr)) {
00796   case BT_LT:
00797     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00798   case BT_AMP:
00799     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00800   case BT_CR:
00801     ptr += MINBPC(enc);
00802     if (ptr == end)
00803       return XML_TOK_TRAILING_CR;
00804     if (BYTE_TYPE(enc, ptr) == BT_LF) /* CR LF is reported as one newline */
00805       ptr += MINBPC(enc);
00806     *nextTokPtr = ptr;
00807     return XML_TOK_DATA_NEWLINE;
00808   case BT_LF:
00809     *nextTokPtr = ptr + MINBPC(enc);
00810     return XML_TOK_DATA_NEWLINE;
00811   case BT_RSQB:
00812     ptr += MINBPC(enc);
00813     if (ptr == end)
00814       return XML_TOK_TRAILING_RSQB;
00815     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
00816       break; /* lone ']' is data; continue with the run scan below */
00817     ptr += MINBPC(enc);
00818     if (ptr == end)
00819       return XML_TOK_TRAILING_RSQB;
00820     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
00821       ptr -= MINBPC(enc); /* back up so the second ']' is examined by the loop */
00822       break;
00823     }
00824     *nextTokPtr = ptr; /* "]]>" is rejected in content */
00825     return XML_TOK_INVALID;
00826   INVALID_CASES(ptr, nextTokPtr)
00827   default:
00828     ptr += MINBPC(enc);
00829     break;
00830   }
          /* Extend the data run; inside the run, problem characters end the
             run instead of producing an error. */
00831   while (ptr != end) {
00832     switch (BYTE_TYPE(enc, ptr)) {
00833 #define LEAD_CASE(n) \
00834     case BT_LEAD ## n: \
00835       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
00836         *nextTokPtr = ptr; \
00837         return XML_TOK_DATA_CHARS; \
00838       } \
00839       ptr += n; \
00840       break;
00841     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
00842 #undef LEAD_CASE
00843     case BT_RSQB:
            /* look ahead up to two characters for "]]>"; if the lookahead
               runs off the end, fall through and end the run at this ']' */
00844       if (ptr + MINBPC(enc) != end) {
00845          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
00846            ptr += MINBPC(enc);
00847            break;
00848          }
00849          if (ptr + 2*MINBPC(enc) != end) {
00850            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
00851              ptr += MINBPC(enc);
00852              break;
00853            }
00854            *nextTokPtr = ptr + 2*MINBPC(enc);
00855            return XML_TOK_INVALID;
00856          }
00857       }
00858       /* fall through */
00859     case BT_AMP:
00860     case BT_LT:
00861     case BT_NONXML:
00862     case BT_MALFORM:
00863     case BT_TRAIL:
00864     case BT_CR:
00865     case BT_LF:
00866       *nextTokPtr = ptr;
00867       return XML_TOK_DATA_CHARS;
00868     default:
00869       ptr += MINBPC(enc);
00870       break;
00871     }
00872   }
00873   *nextTokPtr = ptr;
00874   return XML_TOK_DATA_CHARS;
00875 }
00876 
00877 /* ptr points to character following "%"
         If the '%' is followed by whitespace or another '%', returns
         XML_TOK_PERCENT with *nextTokPtr at that following character.
         If it is followed by a name and ';', returns
         XML_TOK_PARAM_ENTITY_REF with *nextTokPtr just past the ';'.
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character,
         or XML_TOK_PARTIAL when input runs out. */
00878 
00879 static
00880 int PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
00881                         const char **nextTokPtr)
00882 {
00883   if (ptr == end)
00884     return XML_TOK_PARTIAL;
00885   switch (BYTE_TYPE(enc, ptr)) {
00886   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00887   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
00888     *nextTokPtr = ptr;
00889     return XML_TOK_PERCENT;
00890   default:
00891     *nextTokPtr = ptr;
00892     return XML_TOK_INVALID;
00893   }
00894   while (ptr != end) { /* rest of the parameter-entity name */
00895     switch (BYTE_TYPE(enc, ptr)) {
00896     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00897     case BT_SEMI:
00898       *nextTokPtr = ptr + MINBPC(enc);
00899       return XML_TOK_PARAM_ENTITY_REF;
00900     default:
00901       *nextTokPtr = ptr;
00902       return XML_TOK_INVALID;
00903     }
00904   }
00905   return XML_TOK_PARTIAL;
00906 }
00907 /* Scans a name starting at ptr (NOTE(review): presumably ptr follows a
         '#'; the caller is not visible here - confirm).
         Returns XML_TOK_POUND_NAME with *nextTokPtr at the delimiter that
         ends the name (whitespace, ')', '>', '%' or '|'; not consumed).
         Returns -XML_TOK_POUND_NAME, without setting *nextTokPtr, when the
         input ends while still inside the name.
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character,
         or XML_TOK_PARTIAL when there is no input at all. */
00908 static
00909 int PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
00910                           const char **nextTokPtr)
00911 {
00912   if (ptr == end)
00913     return XML_TOK_PARTIAL;
00914   switch (BYTE_TYPE(enc, ptr)) {
00915   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
00916   default:
00917     *nextTokPtr = ptr;
00918     return XML_TOK_INVALID;
00919   }
00920   while (ptr != end) {
00921     switch (BYTE_TYPE(enc, ptr)) {
00922     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
00923     case BT_CR: case BT_LF: case BT_S:
00924     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
00925       *nextTokPtr = ptr;
00926       return XML_TOK_POUND_NAME;
00927     default:
00928       *nextTokPtr = ptr;
00929       return XML_TOK_INVALID;
00930     }
00931   }
00932   return -XML_TOK_POUND_NAME;
00933 }
00934 /* Scans a quoted literal; ptr points just past the opening quote and
         `open` is the byte type of that quote (prologTok passes BT_QUOT or
         BT_APOS).  The other quote kind is treated as ordinary content.
         After the closing quote, *nextTokPtr is set just past it and the
         next character decides the result: whitespace, '>', '%' or '['
         gives XML_TOK_LITERAL, anything else gives XML_TOK_INVALID.
         Returns -XML_TOK_LITERAL, without setting *nextTokPtr, when the
         closing quote is the last character available.
         Returns XML_TOK_INVALID with *nextTokPtr at the offending character
         for an invalid character inside the literal (INVALID_CASES), and
         XML_TOK_PARTIAL when no closing quote is found. */
00935 static
00936 int PREFIX(scanLit)(int open, const ENCODING *enc,
00937                     const char *ptr, const char *end,
00938                     const char **nextTokPtr)
00939 {
00940   while (ptr != end) {
00941     int t = BYTE_TYPE(enc, ptr);
00942     switch (t) {
00943     INVALID_CASES(ptr, nextTokPtr)
00944     case BT_QUOT:
00945     case BT_APOS:
00946       ptr += MINBPC(enc);
00947       if (t != open) /* the other quote kind: part of the literal */
00948         break;
00949       if (ptr == end)
00950         return -XML_TOK_LITERAL;
00951       *nextTokPtr = ptr;
00952       switch (BYTE_TYPE(enc, ptr)) {
00953       case BT_S: case BT_CR: case BT_LF:
00954       case BT_GT: case BT_PERCNT: case BT_LSQB:
00955         return XML_TOK_LITERAL;
00956       default:
00957         return XML_TOK_INVALID;
00958       }
00959     default:
00960       ptr += MINBPC(enc);
00961       break;
00962     }
00963   }
00964   return XML_TOK_PARTIAL;
00965 }
00966 
/* Scans a single token from prolog/DTD text in [ptr, end).

   Returns XML_TOK_NONE when the input is empty.  When the encoding uses more
   than one byte per code unit, a trailing partial unit is ignored, and
   XML_TOK_PARTIAL is returned if no whole unit is left.

   The first character selects the token.  Literals, declarations, processing
   instructions, '%' and '#' are handed to the scanLit / scanDecl / scanPi /
   scanPercent / scanPoundName helpers, whose result is returned unchanged.
   For tokens recognised here, *nextTokPtr is set to the first byte after the
   token, except for XML_TOK_INSTANCE_START, where it points back at the '<'.
   On XML_TOK_INVALID, *nextTokPtr points at the offending character.

   A negative return value (-XML_TOK_PROLOG_S, -XML_TOK_CLOSE_BRACKET,
   -XML_TOK_CLOSE_PAREN, or -tok for a name/nmtoken) is produced when the
   input ends at a point where the token could still be extended; this
   function does not write *nextTokPtr on those paths. */
00967 static
00968 int PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
00969                       const char **nextTokPtr)
00970 {
00971   int tok;
00972   if (ptr == end)
00973     return XML_TOK_NONE;
00974   if (MINBPC(enc) > 1) {
00975     size_t n = end - ptr;
00976     if (n & (MINBPC(enc) - 1)) { /* length is not a whole number of code units */
00977       n &= ~(MINBPC(enc) - 1);
00978       if (n == 0)
00979         return XML_TOK_PARTIAL;
00980       end = ptr + n;
00981     }
00982   }
00983   switch (BYTE_TYPE(enc, ptr)) {
00984   case BT_QUOT:
00985     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
00986   case BT_APOS:
00987     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
00988   case BT_LT:
00989     {
00990       ptr += MINBPC(enc);
00991       if (ptr == end)
00992         return XML_TOK_PARTIAL;
00993       switch (BYTE_TYPE(enc, ptr)) {
00994       case BT_EXCL:
00995         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00996       case BT_QUEST:
00997         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
00998       case BT_NMSTRT:
00999       case BT_HEX:
01000       case BT_NONASCII:
01001       case BT_LEAD2:
01002       case BT_LEAD3:
01003       case BT_LEAD4:
01004         *nextTokPtr = ptr - MINBPC(enc); /* step back onto the '<' itself */
01005         return XML_TOK_INSTANCE_START;
01006       }
01007       *nextTokPtr = ptr;
01008       return XML_TOK_INVALID;
01009     }
01010   case BT_CR:
01011     if (ptr + MINBPC(enc) == end) /* CR is the last character available */
01012       return -XML_TOK_PROLOG_S;
01013     /* fall through */
01014   case BT_S: case BT_LF:
01015     for (;;) {
01016       ptr += MINBPC(enc);
01017       if (ptr == end)
01018         break;
01019       switch (BYTE_TYPE(enc, ptr)) {
01020       case BT_S: case BT_LF:
01021         break;
01022       case BT_CR:
01023         /* don't split CR/LF pair */
01024         if (ptr + MINBPC(enc) != end)
01025           break;
01026         /* fall through */
01027       default:
01028         *nextTokPtr = ptr;
01029         return XML_TOK_PROLOG_S;
01030       }
01031     }
01032     *nextTokPtr = ptr;
01033     return XML_TOK_PROLOG_S;
01034   case BT_PERCNT:
01035     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01036   case BT_COMMA:
01037     *nextTokPtr = ptr + MINBPC(enc);
01038     return XML_TOK_COMMA;
01039   case BT_LSQB:
01040     *nextTokPtr = ptr + MINBPC(enc);
01041     return XML_TOK_OPEN_BRACKET;
01042   case BT_RSQB:
01043     ptr += MINBPC(enc);
01044     if (ptr == end)
01045       return -XML_TOK_CLOSE_BRACKET;
01046     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
01047       if (ptr + MINBPC(enc) == end)
01048         return XML_TOK_PARTIAL;
01049       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
01050         *nextTokPtr = ptr + 2*MINBPC(enc); /* consumed all of "]]>" */
01051         return XML_TOK_COND_SECT_CLOSE;
01052       }
01053     }
01054     *nextTokPtr = ptr;
01055     return XML_TOK_CLOSE_BRACKET;
01056   case BT_LPAR:
01057     *nextTokPtr = ptr + MINBPC(enc);
01058     return XML_TOK_OPEN_PAREN;
01059   case BT_RPAR:
01060     ptr += MINBPC(enc);
01061     if (ptr == end)
01062       return -XML_TOK_CLOSE_PAREN;
01063     switch (BYTE_TYPE(enc, ptr)) {
01064     case BT_AST:
01065       *nextTokPtr = ptr + MINBPC(enc);
01066       return XML_TOK_CLOSE_PAREN_ASTERISK;
01067     case BT_QUEST:
01068       *nextTokPtr = ptr + MINBPC(enc);
01069       return XML_TOK_CLOSE_PAREN_QUESTION;
01070     case BT_PLUS:
01071       *nextTokPtr = ptr + MINBPC(enc);
01072       return XML_TOK_CLOSE_PAREN_PLUS;
01073     case BT_CR: case BT_LF: case BT_S:
01074     case BT_GT: case BT_COMMA: case BT_VERBAR:
01075     case BT_RPAR:
01076       *nextTokPtr = ptr; /* the following character is not part of this token */
01077       return XML_TOK_CLOSE_PAREN;
01078     }
01079     *nextTokPtr = ptr;
01080     return XML_TOK_INVALID;
01081   case BT_VERBAR:
01082     *nextTokPtr = ptr + MINBPC(enc);
01083     return XML_TOK_OR;
01084   case BT_GT:
01085     *nextTokPtr = ptr + MINBPC(enc);
01086     return XML_TOK_DECL_CLOSE;
01087   case BT_NUM:
01088     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01089 #define LEAD_CASE(n) \
01090   case BT_LEAD ## n: \
01091     if (end - ptr < n) \
01092       return XML_TOK_PARTIAL_CHAR; \
01093     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
01094       ptr += n; \
01095       tok = XML_TOK_NAME; \
01096       break; \
01097     } \
01098     if (IS_NAME_CHAR(enc, ptr, n)) { \
01099       ptr += n; \
01100       tok = XML_TOK_NMTOKEN; \
01101       break; \
01102     } \
01103     *nextTokPtr = ptr; \
01104     return XML_TOK_INVALID;
01105     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01106 #undef LEAD_CASE
01107   case BT_NMSTRT:
01108   case BT_HEX:
01109     tok = XML_TOK_NAME;
01110     ptr += MINBPC(enc);
01111     break;
01112   case BT_DIGIT:
01113   case BT_NAME:
01114   case BT_MINUS:
01115 #ifdef XML_NS
01116   case BT_COLON:
01117 #endif
01118     tok = XML_TOK_NMTOKEN;
01119     ptr += MINBPC(enc);
01120     break;
01121   case BT_NONASCII:
01122     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
01123       ptr += MINBPC(enc);
01124       tok = XML_TOK_NAME;
01125       break;
01126     }
01127     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
01128       ptr += MINBPC(enc);
01129       tok = XML_TOK_NMTOKEN;
01130       break;
01131     }
01132     /* fall through */
01133   default:
01134     *nextTokPtr = ptr;
01135     return XML_TOK_INVALID;
01136   }
        /* Only the name / nmtoken cases break out of the switch above; tok
           holds the provisional token type while the rest is consumed. */
01137   while (ptr != end) {
01138     switch (BYTE_TYPE(enc, ptr)) {
01139     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
01140     case BT_GT: case BT_RPAR: case BT_COMMA:
01141     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
01142     case BT_S: case BT_CR: case BT_LF:
01143       *nextTokPtr = ptr; /* delimiter ends the name and is left unconsumed */
01144       return tok;
01145 #ifdef XML_NS
01146     case BT_COLON:
01147       ptr += MINBPC(enc);
01148       switch (tok) {
01149       case XML_TOK_NAME:
01150         if (ptr == end)
01151           return XML_TOK_PARTIAL;
01152         tok = XML_TOK_PREFIXED_NAME;
01153         switch (BYTE_TYPE(enc, ptr)) {
01154         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
01155         default:
01156           tok = XML_TOK_NMTOKEN; /* colon not followed by a name character */
01157           break;
01158         }
01159         break;
01160       case XML_TOK_PREFIXED_NAME:
01161         tok = XML_TOK_NMTOKEN; /* a second colon demotes the token to an nmtoken */
01162         break;
01163       }
01164       break;
01165 #endif
01166     case BT_PLUS:
01167       if (tok == XML_TOK_NMTOKEN)  {
01168         *nextTokPtr = ptr;
01169         return XML_TOK_INVALID;
01170       }
01171       *nextTokPtr = ptr + MINBPC(enc);
01172       return XML_TOK_NAME_PLUS;
01173     case BT_AST:
01174       if (tok == XML_TOK_NMTOKEN)  {
01175         *nextTokPtr = ptr;
01176         return XML_TOK_INVALID;
01177       }
01178       *nextTokPtr = ptr + MINBPC(enc);
01179       return XML_TOK_NAME_ASTERISK;
01180     case BT_QUEST:
01181       if (tok == XML_TOK_NMTOKEN)  {
01182         *nextTokPtr = ptr;
01183         return XML_TOK_INVALID;
01184       }
01185       *nextTokPtr = ptr + MINBPC(enc);
01186       return XML_TOK_NAME_QUESTION;
01187     default:
01188       *nextTokPtr = ptr;
01189       return XML_TOK_INVALID;
01190     }
01191   }
01192   return -tok; /* input ended while still inside the name */
01193 }
01194 
01195 static
01196 int PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end,
01197                               const char **nextTokPtr)
01198 {
01199   const char *start;
01200   if (ptr == end)
01201     return XML_TOK_NONE;
01202   start = ptr;
01203   while (ptr != end) {
01204     switch (BYTE_TYPE(enc, ptr)) {
01205 #define LEAD_CASE(n) \
01206     case BT_LEAD ## n: ptr += n; break;
01207     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01208 #undef LEAD_CASE
01209     case BT_AMP:
01210       if (ptr == start)
01211         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01212       *nextTokPtr = ptr;
01213       return XML_TOK_DATA_CHARS;
01214     case BT_LT:
01215       /* this is for inside entity references */
01216       *nextTokPtr = ptr;
01217       return XML_TOK_INVALID;
01218     case BT_LF:
01219       if (ptr == start) {
01220         *nextTokPtr = ptr + MINBPC(enc);
01221         return XML_TOK_DATA_NEWLINE;
01222       }
01223       *nextTokPtr = ptr;
01224       return XML_TOK_DATA_CHARS;
01225     case BT_CR:
01226       if (ptr == start) {
01227         ptr += MINBPC(enc);
01228         if (ptr == end)
01229           return XML_TOK_TRAILING_CR;
01230         if (BYTE_TYPE(enc, ptr) == BT_LF)
01231           ptr += MINBPC(enc);
01232         *nextTokPtr = ptr;
01233         return XML_TOK_DATA_NEWLINE;
01234       }
01235       *nextTokPtr = ptr;
01236       return XML_TOK_DATA_CHARS;
01237     case BT_S:
01238       if (ptr == start) {
01239         *nextTokPtr = ptr + MINBPC(enc);
01240         return XML_TOK_ATTRIBUTE_VALUE_S;
01241       }
01242       *nextTokPtr = ptr;
01243       return XML_TOK_DATA_CHARS;
01244     default:
01245       ptr += MINBPC(enc);
01246       break;
01247     }
01248   }
01249   *nextTokPtr = ptr;
01250   return XML_TOK_DATA_CHARS;
01251 }
01252 
01253 static
01254 int PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end,
01255                            const char **nextTokPtr)
01256 {
01257   const char *start;
01258   if (ptr == end)
01259     return XML_TOK_NONE;
01260   start = ptr;
01261   while (ptr != end) {
01262     switch (BYTE_TYPE(enc, ptr)) {
01263 #define LEAD_CASE(n) \
01264     case BT_LEAD ## n: ptr += n; break;
01265     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01266 #undef LEAD_CASE
01267     case BT_AMP:
01268       if (ptr == start)
01269         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
01270       *nextTokPtr = ptr;
01271       return XML_TOK_DATA_CHARS;
01272     case BT_PERCNT:
01273       if (ptr == start) {
01274         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
01275                                        end, nextTokPtr);
01276         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
01277       }
01278       *nextTokPtr = ptr;
01279       return XML_TOK_DATA_CHARS;
01280     case BT_LF:
01281       if (ptr == start) {
01282         *nextTokPtr = ptr + MINBPC(enc);
01283         return XML_TOK_DATA_NEWLINE;
01284       }
01285       *nextTokPtr = ptr;
01286       return XML_TOK_DATA_CHARS;
01287     case BT_CR:
01288       if (ptr == start) {
01289         ptr += MINBPC(enc);
01290         if (ptr == end)
01291           return XML_TOK_TRAILING_CR;
01292         if (BYTE_TYPE(enc, ptr) == BT_LF)
01293           ptr += MINBPC(enc);
01294         *nextTokPtr = ptr;
01295         return XML_TOK_DATA_NEWLINE;
01296       }
01297       *nextTokPtr = ptr;
01298       return XML_TOK_DATA_CHARS;
01299     default:
01300       ptr += MINBPC(enc);
01301       break;
01302     }
01303   }
01304   *nextTokPtr = ptr;
01305   return XML_TOK_DATA_CHARS;
01306 }
01307 
01308 #ifdef XML_DTD
01309 
01310 static
01311 int PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr, const char *end,
01312                              const char **nextTokPtr)
01313 {
01314   int level = 0;
01315   if (MINBPC(enc) > 1) {
01316     size_t n = end - ptr;
01317     if (n & (MINBPC(enc) - 1)) {
01318       n &= ~(MINBPC(enc) - 1);
01319       end = ptr + n;
01320     }
01321   }
01322   while (ptr != end) {
01323     switch (BYTE_TYPE(enc, ptr)) {
01324     INVALID_CASES(ptr, nextTokPtr)
01325     case BT_LT:
01326       if ((ptr += MINBPC(enc)) == end)
01327         return XML_TOK_PARTIAL;
01328       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
01329         if ((ptr += MINBPC(enc)) == end)
01330           return XML_TOK_PARTIAL;
01331         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
01332           ++level;
01333           ptr += MINBPC(enc);
01334         }
01335       }
01336       break;
01337     case BT_RSQB:
01338       if ((ptr += MINBPC(enc)) == end)
01339         return XML_TOK_PARTIAL;
01340       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
01341         if ((ptr += MINBPC(enc)) == end)
01342           return XML_TOK_PARTIAL;
01343         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
01344           ptr += MINBPC(enc);
01345           if (level == 0) {
01346             *nextTokPtr = ptr;
01347             return XML_TOK_IGNORE_SECT;
01348           }
01349           --level;
01350         }
01351       }
01352       break;
01353     default:
01354       ptr += MINBPC(enc);
01355       break;
01356     }
01357   }
01358   return XML_TOK_PARTIAL;
01359 }
01360 
01361 #endif /* XML_DTD */
01362 
01363 static
01364 int PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
01365                        const char **badPtr)
01366 {
01367   ptr += MINBPC(enc);
01368   end -= MINBPC(enc);
01369   for (; ptr != end; ptr += MINBPC(enc)) {
01370     switch (BYTE_TYPE(enc, ptr)) {
01371     case BT_DIGIT:
01372     case BT_HEX:
01373     case BT_MINUS:
01374     case BT_APOS:
01375     case BT_LPAR:
01376     case BT_RPAR:
01377     case BT_PLUS:
01378     case BT_COMMA:
01379     case BT_SOL:
01380     case BT_EQUALS:
01381     case BT_QUEST:
01382     case BT_CR:
01383     case BT_LF:
01384     case BT_SEMI:
01385     case BT_EXCL:
01386     case BT_AST:
01387     case BT_PERCNT:
01388     case BT_NUM:
01389 #ifdef XML_NS
01390     case BT_COLON:
01391 #endif
01392       break;
01393     case BT_S:
01394       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
01395         *badPtr = ptr;
01396         return 0;
01397       }
01398       break;
01399     case BT_NAME:
01400     case BT_NMSTRT:
01401       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
01402         break;
01403     default:
01404       switch (BYTE_TO_ASCII(enc, ptr)) {
01405       case 0x24: /* $ */
01406       case 0x40: /* @ */
01407         break;
01408       default:
01409         *badPtr = ptr;
01410         return 0;
01411       }
01412       break;
01413     }
01414   }
01415   return 1;
01416 }
01417 
01418 /* This must only be called for a well-formed start-tag or empty element tag.
01419 Returns the number of attributes.  Pointers to the first attsMax attributes 
01420 are stored in atts. */
01421 
01422 static
01423 int PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
01424                     int attsMax, ATTRIBUTE *atts)
01425 {
01426   enum { other, inName, inValue } state = inName;
01427   int nAtts = 0;
01428   int open = 0; /* defined when state == inValue;
01429                    initialization just to shut up compilers */
01430 
01431   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
01432     switch (BYTE_TYPE(enc, ptr)) {
01433 #define START_NAME \
01434       if (state == other) { \
01435         if (nAtts < attsMax) { \
01436           atts[nAtts].name = ptr; \
01437           atts[nAtts].normalized = 1; \
01438         } \
01439         state = inName; \
01440       }
01441 #define LEAD_CASE(n) \
01442     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
01443     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01444 #undef LEAD_CASE
01445     case BT_NONASCII:
01446     case BT_NMSTRT:
01447     case BT_HEX:
01448       START_NAME
01449       break;
01450 #undef START_NAME
01451     case BT_QUOT:
01452       if (state != inValue) {
01453         if (nAtts < attsMax)
01454           atts[nAtts].valuePtr = ptr + MINBPC(enc);
01455         state = inValue;
01456         open = BT_QUOT;
01457       }
01458       else if (open == BT_QUOT) {
01459         state = other;
01460         if (nAtts < attsMax)
01461           atts[nAtts].valueEnd = ptr;
01462         nAtts++;
01463       }
01464       break;
01465     case BT_APOS:
01466       if (state != inValue) {
01467         if (nAtts < attsMax)
01468           atts[nAtts].valuePtr = ptr + MINBPC(enc);
01469         state = inValue;
01470         open = BT_APOS;
01471       }
01472       else if (open == BT_APOS) {
01473         state = other;
01474         if (nAtts < attsMax)
01475           atts[nAtts].valueEnd = ptr;
01476         nAtts++;
01477       }
01478       break;
01479     case BT_AMP:
01480       if (nAtts < attsMax)
01481         atts[nAtts].normalized = 0;
01482       break;
01483     case BT_S:
01484       if (state == inName)
01485         state = other;
01486       else if (state == inValue
01487                && nAtts < attsMax
01488                && atts[nAtts].normalized
01489                && (ptr == atts[nAtts].valuePtr
01490                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
01491                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
01492                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
01493         atts[nAtts].normalized = 0;
01494       break;
01495     case BT_CR: case BT_LF:
01496       /* This case ensures that the first attribute name is counted
01497          Apart from that we could just change state on the quote. */
01498       if (state == inName)
01499         state = other;
01500       else if (state == inValue && nAtts < attsMax)
01501         atts[nAtts].normalized = 0;
01502       break;
01503     case BT_GT:
01504     case BT_SOL:
01505       if (state != inValue)
01506         return nAtts;
01507       break;
01508     default:
01509       break;
01510     }
01511   }
01512   /* not reached */
01513 }
01514 
01515 static
01516 int PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
01517 {
01518   int result = 0;
01519   /* skip &# */
01520   ptr += 2*MINBPC(enc);
01521   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
01522     for (ptr += MINBPC(enc); !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
01523       int c = BYTE_TO_ASCII(enc, ptr);
01524       switch (c) {
01525       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
01526       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
01527         result <<= 4;
01528         result |= (c - ASCII_0);
01529         break;
01530       case ASCII_A: case ASCII_B: case ASCII_C: case ASCII_D: case ASCII_E: case ASCII_F:
01531         result <<= 4;
01532         result += 10 + (c - ASCII_A);
01533         break;
01534       case ASCII_a: case ASCII_b: case ASCII_c: case ASCII_d: case ASCII_e: case ASCII_f:
01535         result <<= 4;
01536         result += 10 + (c - ASCII_a);
01537         break;
01538       }
01539       if (result >= 0x110000)
01540         return -1;
01541     }
01542   }
01543   else {
01544     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
01545       int c = BYTE_TO_ASCII(enc, ptr);
01546       result *= 10;
01547       result += (c - ASCII_0);
01548       if (result >= 0x110000)
01549         return -1;
01550     }
01551   }
01552   return checkCharRefNumber(result);
01553 }
01554 
01555 static
01556 int PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr, const char *end)
01557 {
01558   switch ((end - ptr)/MINBPC(enc)) {
01559   case 2:
01560     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
01561       switch (BYTE_TO_ASCII(enc, ptr)) {
01562       case ASCII_l:
01563         return ASCII_LT;
01564       case ASCII_g:
01565         return ASCII_GT;
01566       }
01567     }
01568     break;
01569   case 3:
01570     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
01571       ptr += MINBPC(enc);
01572       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
01573         ptr += MINBPC(enc);
01574         if (CHAR_MATCHES(enc, ptr, ASCII_p))
01575           return ASCII_AMP;
01576       }
01577     }
01578     break;
01579   case 4:
01580     switch (BYTE_TO_ASCII(enc, ptr)) {
01581     case ASCII_q:
01582       ptr += MINBPC(enc);
01583       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
01584         ptr += MINBPC(enc);
01585         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
01586           ptr += MINBPC(enc);
01587           if (CHAR_MATCHES(enc, ptr, ASCII_t))
01588             return ASCII_QUOT;
01589         }
01590       }
01591       break;
01592     case ASCII_a:
01593       ptr += MINBPC(enc);
01594       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
01595         ptr += MINBPC(enc);
01596         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
01597           ptr += MINBPC(enc);
01598           if (CHAR_MATCHES(enc, ptr, ASCII_s))
01599             return ASCII_APOS;
01600         }
01601       }
01602       break;
01603     }
01604   }
01605   return 0;
01606 }
01607 
01608 static
01609 int PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
01610 {
01611   for (;;) {
01612     switch (BYTE_TYPE(enc, ptr1)) {
01613 #define LEAD_CASE(n) \
01614     case BT_LEAD ## n: \
01615       if (*ptr1++ != *ptr2++) \
01616         return 0;
01617     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
01618 #undef LEAD_CASE
01619       /* fall through */
01620       if (*ptr1++ != *ptr2++)
01621         return 0;
01622       break;
01623     case BT_NONASCII:
01624     case BT_NMSTRT:
01625 #ifdef XML_NS
01626     case BT_COLON:
01627 #endif
01628     case BT_HEX:
01629     case BT_DIGIT:
01630     case BT_NAME:
01631     case BT_MINUS:
01632       if (*ptr2++ != *ptr1++)
01633         return 0;
01634       if (MINBPC(enc) > 1) {
01635         if (*ptr2++ != *ptr1++)
01636           return 0;
01637         if (MINBPC(enc) > 2) {
01638           if (*ptr2++ != *ptr1++)
01639             return 0;
01640           if (MINBPC(enc) > 3) {
01641             if (*ptr2++ != *ptr1++)
01642               return 0;
01643           }
01644         }
01645       }
01646       break;
01647     default:
01648       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
01649         return 1;
01650       switch (BYTE_TYPE(enc, ptr2)) {
01651       case BT_LEAD2:
01652       case BT_LEAD3:
01653       case BT_LEAD4:
01654       case BT_NONASCII:
01655       case BT_NMSTRT:
01656 #ifdef XML_NS
01657       case BT_COLON:
01658 #endif
01659       case BT_HEX:
01660       case BT_DIGIT:
01661       case BT_NAME:
01662       case BT_MINUS:
01663         return 0;
01664       default:
01665         return 1;
01666       }
01667     }
01668   }
01669   /* not reached */
01670 }
01671 
01672 static
01673 int PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
01674                              const char *end1, const char *ptr2)
01675 {
01676   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
01677     if (ptr1 == end1)
01678       return 0;
01679     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
01680       return 0;
01681   }
01682   return ptr1 == end1;
01683 }
01684 
01685 static
01686 int PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
01687 {
01688   const char *start = ptr;
01689   for (;;) {
01690     switch (BYTE_TYPE(enc, ptr)) {
01691 #define LEAD_CASE(n) \
01692     case BT_LEAD ## n: ptr += n; break;
01693     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01694 #undef LEAD_CASE
01695     case BT_NONASCII:
01696     case BT_NMSTRT:
01697 #ifdef XML_NS
01698     case BT_COLON:
01699 #endif
01700     case BT_HEX:
01701     case BT_DIGIT:
01702     case BT_NAME:
01703     case BT_MINUS:
01704       ptr += MINBPC(enc);
01705       break;
01706     default:
01707       return ptr - start;
01708     }
01709   }
01710 }
01711 
01712 static
01713 const char *PREFIX(skipS)(const ENCODING *enc, const char *ptr)
01714 {
01715   for (;;) {
01716     switch (BYTE_TYPE(enc, ptr)) {
01717     case BT_LF:
01718     case BT_CR:
01719     case BT_S:
01720       ptr += MINBPC(enc);
01721       break;
01722     default:
01723       return ptr;
01724     }
01725   }
01726 }
01727 
01728 static
01729 void PREFIX(updatePosition)(const ENCODING *enc,
01730                             const char *ptr,
01731                             const char *end,
01732                             POSITION *pos)
01733 {
01734   while (ptr != end) {
01735     switch (BYTE_TYPE(enc, ptr)) {
01736 #define LEAD_CASE(n) \
01737     case BT_LEAD ## n: \
01738       ptr += n; \
01739       break;
01740     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
01741 #undef LEAD_CASE
01742     case BT_LF:
01743       pos->columnNumber = (unsigned)-1;
01744       pos->lineNumber++;
01745       ptr += MINBPC(enc);
01746       break;
01747     case BT_CR:
01748       pos->lineNumber++;
01749       ptr += MINBPC(enc);
01750       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
01751         ptr += MINBPC(enc);
01752       pos->columnNumber = (unsigned)-1;
01753       break;
01754     default:
01755       ptr += MINBPC(enc);
01756       break;
01757     }
01758     pos->columnNumber++;
01759   }
01760 }
01761 
01762 #undef DO_LEAD_CASE
01763 #undef MULTIBYTE_CASES
01764 #undef INVALID_CASES
01765 #undef CHECK_NAME_CASE
01766 #undef CHECK_NAME_CASES
01767 #undef CHECK_NMSTRT_CASE
01768 #undef CHECK_NMSTRT_CASES


Last updated at Tue Dec 18 21:07:42 PST 2007. This site and project are hosted by SourceForge.net.
Source Perspective by Fisheye