00001
00002
00003
00004
00005
00006
00007 #ifdef COMPILED_FROM_DSP
00008 # include "winconfig.h"
00009 #else
00010 # include <config.h>
00011 #endif
00012
00013 #include "xmltok.h"
00014 #include "nametab.h"
00015
00016 #ifdef XML_DTD
00017 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
00018 #else
00019 #define IGNORE_SECTION_TOK_VTABLE
00020 #endif
00021
00022 #define VTABLE1 \
00023 { PREFIX(prologTok), PREFIX(contentTok), \
00024 PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
00025 { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00026 PREFIX(sameName), \
00027 PREFIX(nameMatchesAscii), \
00028 PREFIX(nameLength), \
00029 PREFIX(skipS), \
00030 PREFIX(getAtts), \
00031 PREFIX(charRefNumber), \
00032 PREFIX(predefinedEntityName), \
00033 PREFIX(updatePosition), \
00034 PREFIX(isPublicId)
00035
00036 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00037
/* Look up one UCS-2 character in the naming bitmap.
   pages - table indexed by the character's high byte; each entry selects
           a group of eight bitmap words
   hi    - high byte of the character
   lo    - low byte of the character: its top 3 bits pick the word inside
           the group, its low 5 bits pick the bit inside the word
   Evaluates to non-zero when the bit is set.
   The mask is built from 1u because (lo & 0x1F) can be 31 and shifting a
   signed 1 into the sign bit is undefined behaviour. */
#define UCS2_GET_NAMING(pages, hi, lo) \
  (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
00040
00041
00042
00043
00044
/* Naming-bitmap lookup for a 2-byte UTF-8 sequence (110yyyyy 10xxxxxx).
   The 11 payload bits are split so that the top 3 index `pages`, the next
   3 select a word inside the page's group of eight, and the low 5 select
   the bit.  `byte` must be indexable as unsigned bytes.
   The mask uses 1u: the shift count can reach 31, which would be undefined
   behaviour on a signed 1. */
#define UTF8_GET_NAMING2(pages, byte) \
  (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                + ((((byte)[0]) & 3) << 1) \
                + ((((byte)[1]) >> 5) & 1)] \
   & (1u << (((byte)[1]) & 0x1F)))

/* Naming-bitmap lookup for a 3-byte UTF-8 sequence
   (1110zzzz 10yyyyyy 10xxxxxx).  The 16 payload bits are split so that the
   top 8 index `pages`, the next 3 select the word and the low 5 select the
   bit, mirroring UCS2_GET_NAMING on the decoded character. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                         + ((((byte)[1]) >> 2) & 0xF)] \
                 << 3) \
                + ((((byte)[1]) & 3) << 1) \
                + ((((byte)[2]) >> 5) & 1)] \
   & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2 and 3 use the lookups above, any
   other length evaluates to 0.  p is reinterpreted as unsigned bytes. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
   : ((n) == 3 \
      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
      : 0))
00069
/* Non-zero when the 3-byte UTF-8 sequence at p must be rejected:
   - lead byte 0xED with bit 0x20 set in the second byte (the encodings of
     U+D800..U+DFFF, the surrogate range), or
   - the exact sequences EF BF BE / EF BF BF (U+FFFE / U+FFFF).
   p must point at unsigned bytes.  The argument is parenthesized at every
   use so that pointer expressions such as `buf + 1` are dereferenced
   correctly. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
   ? (((p)[1] & 0x20) != 0) \
   : (*(p) == 0xEF \
      ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
      : 0))

/* Non-zero when the 4-byte UTF-8 sequence at p has lead byte 0xF4 and a
   second byte with either of the 0x30 bits set, i.e. it would encode a
   value above U+10FFFF.  p must point at unsigned bytes. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
00078
00079 static
00080 int isNever(const ENCODING *enc, const char *p)
00081 {
00082 return 0;
00083 }
00084
00085 static
00086 int utf8_isName2(const ENCODING *enc, const char *p)
00087 {
00088 return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00089 }
00090
00091 static
00092 int utf8_isName3(const ENCODING *enc, const char *p)
00093 {
00094 return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00095 }
00096
00097 #define utf8_isName4 isNever
00098
00099 static
00100 int utf8_isNmstrt2(const ENCODING *enc, const char *p)
00101 {
00102 return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00103 }
00104
00105 static
00106 int utf8_isNmstrt3(const ENCODING *enc, const char *p)
00107 {
00108 return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00109 }
00110
00111 #define utf8_isNmstrt4 isNever
00112
00113 #define utf8_isInvalid2 isNever
00114
00115 static
00116 int utf8_isInvalid3(const ENCODING *enc, const char *p)
00117 {
00118 return UTF8_INVALID3((const unsigned char *)p);
00119 }
00120
00121 static
00122 int utf8_isInvalid4(const ENCODING *enc, const char *p)
00123 {
00124 return UTF8_INVALID4((const unsigned char *)p);
00125 }
00126
/* An ENCODING extended with a per-byte classification table and
   character-class callbacks.  `enc` is the first member, and code in this
   file casts an ENCODING pointer to a struct normal_encoding pointer to
   reach the extra fields.  Instances are built with positional
   initializers (STANDARD_VTABLE / NORMAL_VTABLE), so the member order
   below must stay in step with those macros. */
00127 struct normal_encoding {
00128 ENCODING enc;
/* Byte-type code for each possible byte value, indexed by the byte
   converted to unsigned char. */
00129 unsigned char type[256];
/* With XML_MIN_SIZE the per-encoding primitives are reached through
   these pointers (see the BYTE_TYPE, IS_NAME_CHAR_MINBPC, BYTE_TO_ASCII
   and CHAR_MATCHES macro definitions under XML_MIN_SIZE). */
00130 #ifdef XML_MIN_SIZE
00131 int (*byteType)(const ENCODING *, const char *);
00132 int (*isNameMin)(const ENCODING *, const char *);
00133 int (*isNmstrtMin)(const ENCODING *, const char *);
00134 int (*byteToAscii)(const ENCODING *, const char *);
00135 int (*charMatches)(const ENCODING *, const char *, int);
00136 #endif
/* Name-character tests; the numeric suffix is the `n` pasted in by
   IS_NAME_CHAR(enc, p, n). */
00137 int (*isName2)(const ENCODING *, const char *);
00138 int (*isName3)(const ENCODING *, const char *);
00139 int (*isName4)(const ENCODING *, const char *);
/* Name-start-character tests, selected by IS_NMSTRT_CHAR(enc, p, n). */
00140 int (*isNmstrt2)(const ENCODING *, const char *);
00141 int (*isNmstrt3)(const ENCODING *, const char *);
00142 int (*isNmstrt4)(const ENCODING *, const char *);
/* Invalid-sequence tests, selected by IS_INVALID_CHAR(enc, p, n). */
00143 int (*isInvalid2)(const ENCODING *, const char *);
00144 int (*isInvalid3)(const ENCODING *, const char *);
00145 int (*isInvalid4)(const ENCODING *, const char *);
00146 };
00147
00148 #ifdef XML_MIN_SIZE
00149
00150 #define STANDARD_VTABLE(E) \
00151 E ## byteType, \
00152 E ## isNameMin, \
00153 E ## isNmstrtMin, \
00154 E ## byteToAscii, \
00155 E ## charMatches,
00156
00157 #else
00158
00159 #define STANDARD_VTABLE(E)
00160
00161 #endif
00162
00163 #define NORMAL_VTABLE(E) \
00164 E ## isName2, \
00165 E ## isName3, \
00166 E ## isName4, \
00167 E ## isNmstrt2, \
00168 E ## isNmstrt3, \
00169 E ## isNmstrt4, \
00170 E ## isInvalid2, \
00171 E ## isInvalid3, \
00172 E ## isInvalid4
00173
00174 static int checkCharRefNumber(int);
00175
00176 #include "xmltok_impl.h"
00177 #include "ascii.h"
00178
00179 #ifdef XML_MIN_SIZE
00180 #define sb_isNameMin isNever
00181 #define sb_isNmstrtMin isNever
00182 #endif
00183
00184 #ifdef XML_MIN_SIZE
00185 #define MINBPC(enc) ((enc)->minBytesPerChar)
00186 #else
00187
00188 #define MINBPC(enc) 1
00189 #endif
00190
00191 #define SB_BYTE_TYPE(enc, p) \
00192 (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00193
00194 #ifdef XML_MIN_SIZE
00195 static
00196 int sb_byteType(const ENCODING *enc, const char *p)
00197 {
00198 return SB_BYTE_TYPE(enc, p);
00199 }
00200 #define BYTE_TYPE(enc, p) \
00201 (((const struct normal_encoding *)(enc))->byteType(enc, p))
00202 #else
00203 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00204 #endif
00205
00206 #ifdef XML_MIN_SIZE
00207 #define BYTE_TO_ASCII(enc, p) \
00208 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
00209 static
00210 int sb_byteToAscii(const ENCODING *enc, const char *p)
00211 {
00212 return *p;
00213 }
00214 #else
00215 #define BYTE_TO_ASCII(enc, p) (*(p))
00216 #endif
00217
00218 #define IS_NAME_CHAR(enc, p, n) \
00219 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
00220 #define IS_NMSTRT_CHAR(enc, p, n) \
00221 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
00222 #define IS_INVALID_CHAR(enc, p, n) \
00223 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
00224
00225 #ifdef XML_MIN_SIZE
00226 #define IS_NAME_CHAR_MINBPC(enc, p) \
00227 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
00228 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00229 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
00230 #else
00231 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00232 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00233 #endif
00234
00235 #ifdef XML_MIN_SIZE
00236 #define CHAR_MATCHES(enc, p, c) \
00237 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
00238 static
00239 int sb_charMatches(const ENCODING *enc, const char *p, int c)
00240 {
00241 return *p == c;
00242 }
00243 #else
00244
00245 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
00246 #endif
00247
00248 #define PREFIX(ident) normal_ ## ident
00249 #include "xmltok_impl.c"
00250
00251 #undef MINBPC
00252 #undef BYTE_TYPE
00253 #undef BYTE_TO_ASCII
00254 #undef CHAR_MATCHES
00255 #undef IS_NAME_CHAR
00256 #undef IS_NAME_CHAR_MINBPC
00257 #undef IS_NMSTRT_CHAR
00258 #undef IS_NMSTRT_CHAR_MINBPC
00259 #undef IS_INVALID_CHAR
00260
/* Marker bits OR-ed into the first byte of a UTF-8 sequence of 1, 2, 3
   and 4 bytes respectively (see XmlUtf8Encode and the *_toUtf8
   converters in this file). */
00261 enum {
00262 UTF8_cval1 = 0x00,
00263 UTF8_cval2 = 0xc0,
00264 UTF8_cval3 = 0xe0,
00265 UTF8_cval4 = 0xf0
00266 };
00267
00268 static
00269 void utf8_toUtf8(const ENCODING *enc,
00270 const char **fromP, const char *fromLim,
00271 char **toP, const char *toLim)
00272 {
00273 char *to;
00274 const char *from;
00275 if (fromLim - *fromP > toLim - *toP) {
00276
00277 for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00278 if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00279 break;
00280 }
00281 for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00282 *to = *from;
00283 *fromP = from;
00284 *toP = to;
00285 }
00286
00287 static
00288 void utf8_toUtf16(const ENCODING *enc,
00289 const char **fromP, const char *fromLim,
00290 unsigned short **toP, const unsigned short *toLim)
00291 {
00292 unsigned short *to = *toP;
00293 const char *from = *fromP;
00294 while (from != fromLim && to != toLim) {
00295 switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00296 case BT_LEAD2:
00297 *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
00298 from += 2;
00299 break;
00300 case BT_LEAD3:
00301 *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
00302 from += 3;
00303 break;
00304 case BT_LEAD4:
00305 {
00306 unsigned long n;
00307 if (to + 1 == toLim)
00308 break;
00309 n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00310 n -= 0x10000;
00311 to[0] = (unsigned short)((n >> 10) | 0xD800);
00312 to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00313 to += 2;
00314 from += 4;
00315 }
00316 break;
00317 default:
00318 *to++ = *from++;
00319 break;
00320 }
00321 }
00322 *fromP = from;
00323 *toP = to;
00324 }
00325
00326 #ifdef XML_NS
00327 static const struct normal_encoding utf8_encoding_ns = {
00328 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00329 {
00330 #include "asciitab.h"
00331 #include "utf8tab.h"
00332 },
00333 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00334 };
00335 #endif
00336
00337 static const struct normal_encoding utf8_encoding = {
00338 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00339 {
00340 #define BT_COLON BT_NMSTRT
00341 #include "asciitab.h"
00342 #undef BT_COLON
00343 #include "utf8tab.h"
00344 },
00345 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00346 };
00347
00348 #ifdef XML_NS
00349
00350 static const struct normal_encoding internal_utf8_encoding_ns = {
00351 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00352 {
00353 #include "iasciitab.h"
00354 #include "utf8tab.h"
00355 },
00356 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00357 };
00358
00359 #endif
00360
00361 static const struct normal_encoding internal_utf8_encoding = {
00362 { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00363 {
00364 #define BT_COLON BT_NMSTRT
00365 #include "iasciitab.h"
00366 #undef BT_COLON
00367 #include "utf8tab.h"
00368 },
00369 STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00370 };
00371
00372 static
00373 void latin1_toUtf8(const ENCODING *enc,
00374 const char **fromP, const char *fromLim,
00375 char **toP, const char *toLim)
00376 {
00377 for (;;) {
00378 unsigned char c;
00379 if (*fromP == fromLim)
00380 break;
00381 c = (unsigned char)**fromP;
00382 if (c & 0x80) {
00383 if (toLim - *toP < 2)
00384 break;
00385 *(*toP)++ = ((c >> 6) | UTF8_cval2);
00386 *(*toP)++ = ((c & 0x3f) | 0x80);
00387 (*fromP)++;
00388 }
00389 else {
00390 if (*toP == toLim)
00391 break;
00392 *(*toP)++ = *(*fromP)++;
00393 }
00394 }
00395 }
00396
00397 static
00398 void latin1_toUtf16(const ENCODING *enc,
00399 const char **fromP, const char *fromLim,
00400 unsigned short **toP, const unsigned short *toLim)
00401 {
00402 while (*fromP != fromLim && *toP != toLim)
00403 *(*toP)++ = (unsigned char)*(*fromP)++;
00404 }
00405
00406 #ifdef XML_NS
00407
00408 static const struct normal_encoding latin1_encoding_ns = {
00409 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00410 {
00411 #include "asciitab.h"
00412 #include "latin1tab.h"
00413 },
00414 STANDARD_VTABLE(sb_)
00415 };
00416
00417 #endif
00418
00419 static const struct normal_encoding latin1_encoding = {
00420 { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00421 {
00422 #define BT_COLON BT_NMSTRT
00423 #include "asciitab.h"
00424 #undef BT_COLON
00425 #include "latin1tab.h"
00426 },
00427 STANDARD_VTABLE(sb_)
00428 };
00429
00430 static
00431 void ascii_toUtf8(const ENCODING *enc,
00432 const char **fromP, const char *fromLim,
00433 char **toP, const char *toLim)
00434 {
00435 while (*fromP != fromLim && *toP != toLim)
00436 *(*toP)++ = *(*fromP)++;
00437 }
00438
00439 #ifdef XML_NS
00440
00441 static const struct normal_encoding ascii_encoding_ns = {
00442 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00443 {
00444 #include "asciitab.h"
00445
00446 },
00447 STANDARD_VTABLE(sb_)
00448 };
00449
00450 #endif
00451
00452 static const struct normal_encoding ascii_encoding = {
00453 { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00454 {
00455 #define BT_COLON BT_NMSTRT
00456 #include "asciitab.h"
00457 #undef BT_COLON
00458
00459 },
00460 STANDARD_VTABLE(sb_)
00461 };
00462
00463 static int unicode_byte_type(char hi, char lo)
00464 {
00465 switch ((unsigned char)hi) {
00466 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00467 return BT_LEAD4;
00468 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00469 return BT_TRAIL;
00470 case 0xFF:
00471 switch ((unsigned char)lo) {
00472 case 0xFF:
00473 case 0xFE:
00474 return BT_NONXML;
00475 }
00476 break;
00477 }
00478 return BT_NONASCII;
00479 }
00480
/* Generate `E##toUtf8`, a UTF-16 to UTF-8 converter.  GET_LO / GET_HI
   must be defined at the point of expansion to pick the low and high
   byte of a 2-byte unit; that is what makes the result little- or
   big-endian.
   The generated function consumes input from [*fromP, fromLim) two bytes
   at a time and writes UTF-8 to [*toP, toLim).  It stops, leaving *fromP
   at the first unconverted unit, when the next character does not fit in
   the output or when a high surrogate is not followed by another unit in
   the input.  A trailing odd byte is never consumed. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from = *fromP; \
  /* Round the input down to whole units so the 2-byte stride cannot */ \
  /* step over fromLim. */ \
  fromLim = from + (((fromLim - from) >> 1) << 1); \
  for (; from != fromLim; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through: U+0080..U+00FF take two bytes */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim - *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) | UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim - *toP < 3) { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      if (toLim - *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      /* The second unit of the pair must be inside the input. */ \
      if (fromLim - from < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
00543
/* Generate `E##toUtf16`, which copies 2-byte units from
   [*fromP, fromLim) into native unsigned short values in [*toP, toLim),
   assembling each value with GET_HI / GET_LO (which must be defined at
   the point of expansion).  Both cursors are advanced in place.  A
   trailing odd byte is never consumed. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  /* Round the input down to whole units so the 2-byte stride cannot */ \
  /* step over fromLim. */ \
  fromLim = *fromP + (((fromLim - *fromP) >> 1) << 1); \
  /* When there is more input than output room and the high byte of */ \
  /* the final input unit is in 0xD8..0xDF (a surrogate), leave that */ \
  /* unit unconverted. */ \
  if (fromLim - *fromP > ((toLim - *toP) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
    *(*toP)++ = (unsigned short)((GET_HI(*fromP) << 8) | GET_LO(*fromP)); \
}
00557
00558 #define SET2(ptr, ch) \
00559 (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00560 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00561 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00562
00563 DEFINE_UTF16_TO_UTF8(little2_)
00564 DEFINE_UTF16_TO_UTF16(little2_)
00565
00566 #undef SET2
00567 #undef GET_LO
00568 #undef GET_HI
00569
00570 #define SET2(ptr, ch) \
00571 (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00572 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00573 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00574
00575 DEFINE_UTF16_TO_UTF8(big2_)
00576 DEFINE_UTF16_TO_UTF16(big2_)
00577
00578 #undef SET2
00579 #undef GET_LO
00580 #undef GET_HI
00581
00582 #define LITTLE2_BYTE_TYPE(enc, p) \
00583 ((p)[1] == 0 \
00584 ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
00585 : unicode_byte_type((p)[1], (p)[0]))
00586 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
00587 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
00588 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
00589 UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
00590 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00591 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
00592
00593 #ifdef XML_MIN_SIZE
00594
00595 static
00596 int little2_byteType(const ENCODING *enc, const char *p)
00597 {
00598 return LITTLE2_BYTE_TYPE(enc, p);
00599 }
00600
00601 static
00602 int little2_byteToAscii(const ENCODING *enc, const char *p)
00603 {
00604 return LITTLE2_BYTE_TO_ASCII(enc, p);
00605 }
00606
00607 static
00608 int little2_charMatches(const ENCODING *enc, const char *p, int c)
00609 {
00610 return LITTLE2_CHAR_MATCHES(enc, p, c);
00611 }
00612
00613 static
00614 int little2_isNameMin(const ENCODING *enc, const char *p)
00615 {
00616 return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00617 }
00618
00619 static
00620 int little2_isNmstrtMin(const ENCODING *enc, const char *p)
00621 {
00622 return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00623 }
00624
00625 #undef VTABLE
00626 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00627
00628 #else
00629
00630 #undef PREFIX
00631 #define PREFIX(ident) little2_ ## ident
00632 #define MINBPC(enc) 2
00633
00634 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00635 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00636 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00637 #define IS_NAME_CHAR(enc, p, n) 0
00638 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00639 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00640 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00641
00642 #include "xmltok_impl.c"
00643
00644 #undef MINBPC
00645 #undef BYTE_TYPE
00646 #undef BYTE_TO_ASCII
00647 #undef CHAR_MATCHES
00648 #undef IS_NAME_CHAR
00649 #undef IS_NAME_CHAR_MINBPC
00650 #undef IS_NMSTRT_CHAR
00651 #undef IS_NMSTRT_CHAR_MINBPC
00652 #undef IS_INVALID_CHAR
00653
00654 #endif
00655
00656 #ifdef XML_NS
00657
00658 static const struct normal_encoding little2_encoding_ns = {
00659 { VTABLE, 2, 0,
00660 #if XML_BYTE_ORDER == 12
00661 1
00662 #else
00663 0
00664 #endif
00665 },
00666 {
00667 #include "asciitab.h"
00668 #include "latin1tab.h"
00669 },
00670 STANDARD_VTABLE(little2_)
00671 };
00672
00673 #endif
00674
00675 static const struct normal_encoding little2_encoding = {
00676 { VTABLE, 2, 0,
00677 #if XML_BYTE_ORDER == 12
00678 1
00679 #else
00680 0
00681 #endif
00682 },
00683 {
00684 #define BT_COLON BT_NMSTRT
00685 #include "asciitab.h"
00686 #undef BT_COLON
00687 #include "latin1tab.h"
00688 },
00689 STANDARD_VTABLE(little2_)
00690 };
00691
00692 #if XML_BYTE_ORDER != 21
00693
00694 #ifdef XML_NS
00695
00696 static const struct normal_encoding internal_little2_encoding_ns = {
00697 { VTABLE, 2, 0, 1 },
00698 {
00699 #include "iasciitab.h"
00700 #include "latin1tab.h"
00701 },
00702 STANDARD_VTABLE(little2_)
00703 };
00704
00705 #endif
00706
00707 static const struct normal_encoding internal_little2_encoding = {
00708 { VTABLE, 2, 0, 1 },
00709 {
00710 #define BT_COLON BT_NMSTRT
00711 #include "iasciitab.h"
00712 #undef BT_COLON
00713 #include "latin1tab.h"
00714 },
00715 STANDARD_VTABLE(little2_)
00716 };
00717
00718 #endif
00719
00720
00721 #define BIG2_BYTE_TYPE(enc, p) \
00722 ((p)[0] == 0 \
00723 ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
00724 : unicode_byte_type((p)[0], (p)[1]))
00725 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
00726 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
00727 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
00728 UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
00729 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00730 UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
00731
00732 #ifdef XML_MIN_SIZE
00733
00734 static
00735 int big2_byteType(const ENCODING *enc, const char *p)
00736 {
00737 return BIG2_BYTE_TYPE(enc, p);
00738 }
00739
00740 static
00741 int big2_byteToAscii(const ENCODING *enc, const char *p)
00742 {
00743 return BIG2_BYTE_TO_ASCII(enc, p);
00744 }
00745
00746 static
00747 int big2_charMatches(const ENCODING *enc, const char *p, int c)
00748 {
00749 return BIG2_CHAR_MATCHES(enc, p, c);
00750 }
00751
00752 static
00753 int big2_isNameMin(const ENCODING *enc, const char *p)
00754 {
00755 return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00756 }
00757
00758 static
00759 int big2_isNmstrtMin(const ENCODING *enc, const char *p)
00760 {
00761 return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00762 }
00763
00764 #undef VTABLE
00765 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00766
00767 #else
00768
00769 #undef PREFIX
00770 #define PREFIX(ident) big2_ ## ident
00771 #define MINBPC(enc) 2
00772
00773 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00774 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00775 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00776 #define IS_NAME_CHAR(enc, p, n) 0
00777 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00778 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00779 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00780
00781 #include "xmltok_impl.c"
00782
00783 #undef MINBPC
00784 #undef BYTE_TYPE
00785 #undef BYTE_TO_ASCII
00786 #undef CHAR_MATCHES
00787 #undef IS_NAME_CHAR
00788 #undef IS_NAME_CHAR_MINBPC
00789 #undef IS_NMSTRT_CHAR
00790 #undef IS_NMSTRT_CHAR_MINBPC
00791 #undef IS_INVALID_CHAR
00792
00793 #endif
00794
00795 #ifdef XML_NS
00796
00797 static const struct normal_encoding big2_encoding_ns = {
00798 { VTABLE, 2, 0,
00799 #if XML_BYTE_ORDER == 21
00800 1
00801 #else
00802 0
00803 #endif
00804 },
00805 {
00806 #include "asciitab.h"
00807 #include "latin1tab.h"
00808 },
00809 STANDARD_VTABLE(big2_)
00810 };
00811
00812 #endif
00813
00814 static const struct normal_encoding big2_encoding = {
00815 { VTABLE, 2, 0,
00816 #if XML_BYTE_ORDER == 21
00817 1
00818 #else
00819 0
00820 #endif
00821 },
00822 {
00823 #define BT_COLON BT_NMSTRT
00824 #include "asciitab.h"
00825 #undef BT_COLON
00826 #include "latin1tab.h"
00827 },
00828 STANDARD_VTABLE(big2_)
00829 };
00830
00831 #if XML_BYTE_ORDER != 12
00832
00833 #ifdef XML_NS
00834
00835 static const struct normal_encoding internal_big2_encoding_ns = {
00836 { VTABLE, 2, 0, 1 },
00837 {
00838 #include "iasciitab.h"
00839 #include "latin1tab.h"
00840 },
00841 STANDARD_VTABLE(big2_)
00842 };
00843
00844 #endif
00845
00846 static const struct normal_encoding internal_big2_encoding = {
00847 { VTABLE, 2, 0, 1 },
00848 {
00849 #define BT_COLON BT_NMSTRT
00850 #include "iasciitab.h"
00851 #undef BT_COLON
00852 #include "latin1tab.h"
00853 },
00854 STANDARD_VTABLE(big2_)
00855 };
00856
00857 #endif
00858
00859 #undef PREFIX
00860
00861 static
00862 int streqci(const char *s1, const char *s2)
00863 {
00864 for (;;) {
00865 char c1 = *s1++;
00866 char c2 = *s2++;
00867 if (ASCII_a <= c1 && c1 <= ASCII_z)
00868 c1 += ASCII_A - ASCII_a;
00869 if (ASCII_a <= c2 && c2 <= ASCII_z)
00870 c2 += ASCII_A - ASCII_a;
00871 if (c1 != c2)
00872 return 0;
00873 if (!c1)
00874 break;
00875 }
00876 return 1;
00877 }
00878
00879 static
00880 void initUpdatePosition(const ENCODING *enc, const char *ptr,
00881 const char *end, POSITION *pos)
00882 {
00883 normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00884 }
00885
00886 static
00887 int toAscii(const ENCODING *enc, const char *ptr, const char *end)
00888 {
00889 char buf[1];
00890 char *p = buf;
00891 XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00892 if (p == buf)
00893 return -1;
00894 else
00895 return buf[0];
00896 }
00897
/* Return 1 when c is space (0x20), carriage return (0xD), line feed
   (0xA) or tab (0x9); 0 for every other value, including -1. */
static int
isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
00910
00911
00912
00913 static
00914 int parsePseudoAttribute(const ENCODING *enc,
00915 const char *ptr,
00916 const char *end,
00917 const char **namePtr,
00918 const char **nameEndPtr,
00919 const char **valPtr,
00920 const char **nextTokPtr)
00921 {
00922 int c;
00923 char open;
00924 if (ptr == end) {
00925 *namePtr = 0;
00926 return 1;
00927 }
00928 if (!isSpace(toAscii(enc, ptr, end))) {
00929 *nextTokPtr = ptr;
00930 return 0;
00931 }
00932 do {
00933 ptr += enc->minBytesPerChar;
00934 } while (isSpace(toAscii(enc, ptr, end)));
00935 if (ptr == end) {
00936 *namePtr = 0;
00937 return 1;
00938 }
00939 *namePtr = ptr;
00940 for (;;) {
00941 c = toAscii(enc, ptr, end);
00942 if (c == -1) {
00943 *nextTokPtr = ptr;
00944 return 0;
00945 }
00946 if (c == ASCII_EQUALS) {
00947 *nameEndPtr = ptr;
00948 break;
00949 }
00950 if (isSpace(c)) {
00951 *nameEndPtr = ptr;
00952 do {
00953 ptr += enc->minBytesPerChar;
00954 } while (isSpace(c = toAscii(enc, ptr, end)));
00955 if (c != ASCII_EQUALS) {
00956 *nextTokPtr = ptr;
00957 return 0;
00958 }
00959 break;
00960 }
00961 ptr += enc->minBytesPerChar;
00962 }
00963 if (ptr == *namePtr) {
00964 *nextTokPtr = ptr;
00965 return 0;
00966 }
00967 ptr += enc->minBytesPerChar;
00968 c = toAscii(enc, ptr, end);
00969 while (isSpace(c)) {
00970 ptr += enc->minBytesPerChar;
00971 c = toAscii(enc, ptr, end);
00972 }
00973 if (c != ASCII_QUOT && c != ASCII_APOS) {
00974 *nextTokPtr = ptr;
00975 return 0;
00976 }
00977 open = c;
00978 ptr += enc->minBytesPerChar;
00979 *valPtr = ptr;
00980 for (;; ptr += enc->minBytesPerChar) {
00981 c = toAscii(enc, ptr, end);
00982 if (c == open)
00983 break;
00984 if (!(ASCII_a <= c && c <= ASCII_z)
00985 && !(ASCII_A <= c && c <= ASCII_Z)
00986 && !(ASCII_0 <= c && c <= ASCII_9)
00987 && c != ASCII_PERIOD
00988 && c != ASCII_MINUS
00989 && c != ASCII_UNDERSCORE) {
00990 *nextTokPtr = ptr;
00991 return 0;
00992 }
00993 }
00994 *nextTokPtr = ptr + enc->minBytesPerChar;
00995 return 1;
00996 }
00997
/* NUL-terminated keyword strings spelled with the ASCII_* character
   constants.  doParseXmlDecl compares pseudo-attribute names and values
   against them with XmlNameMatchesAscii. */

/* "version" */
00998 static const char KW_version[] = {
00999 ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
01000 };
01001
/* "encoding" */
01002 static const char KW_encoding[] = {
01003 ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
01004 };
01005
/* "standalone" */
01006 static const char KW_standalone[] = {
01007 ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
01008 };
01009
/* "yes" */
01010 static const char KW_yes[] = {
01011 ASCII_y, ASCII_e, ASCII_s, '\0'
01012 };
01013
/* "no" */
01014 static const char KW_no[] = {
01015 ASCII_n, ASCII_o, '\0'
01016 };
01017
01018 static
01019 int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01020 const char *,
01021 const char *),
01022 int isGeneralTextEntity,
01023 const ENCODING *enc,
01024 const char *ptr,
01025 const char *end,
01026 const char **badPtr,
01027 const char **versionPtr,
01028 const char **versionEndPtr,
01029 const char **encodingName,
01030 const ENCODING **encoding,
01031 int *standalone)
01032 {
01033 const char *val = 0;
01034 const char *name = 0;
01035 const char *nameEnd = 0;
01036 ptr += 5 * enc->minBytesPerChar;
01037 end -= 2 * enc->minBytesPerChar;
01038 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) {
01039 *badPtr = ptr;
01040 return 0;
01041 }
01042 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
01043 if (!isGeneralTextEntity) {
01044 *badPtr = name;
01045 return 0;
01046 }
01047 }
01048 else {
01049 if (versionPtr)
01050 *versionPtr = val;
01051 if (versionEndPtr)
01052 *versionEndPtr = ptr;
01053 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01054 *badPtr = ptr;
01055 return 0;
01056 }
01057 if (!name) {
01058 if (isGeneralTextEntity) {
01059
01060 *badPtr = ptr;
01061 return 0;
01062 }
01063 return 1;
01064 }
01065 }
01066 if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
01067 int c = toAscii(enc, val, end);
01068 if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
01069 *badPtr = val;
01070 return 0;
01071 }
01072 if (encodingName)
01073 *encodingName = val;
01074 if (encoding)
01075 *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01076 if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01077 *badPtr = ptr;
01078 return 0;
01079 }
01080 if (!name)
01081 return 1;
01082 }
01083 if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity) {
01084 *badPtr = name;
01085 return 0;
01086 }
01087 if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
01088 if (standalone)
01089 *standalone = 1;
01090 }
01091 else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
01092 if (standalone)
01093 *standalone = 0;
01094 }
01095 else {
01096 *badPtr = val;
01097 return 0;
01098 }
01099 while (isSpace(toAscii(enc, ptr, end)))
01100 ptr += enc->minBytesPerChar;
01101 if (ptr != end) {
01102 *badPtr = ptr;
01103 return 0;
01104 }
01105 return 1;
01106 }
01107
01108 static
01109 int checkCharRefNumber(int result)
01110 {
01111 switch (result >> 8) {
01112 case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01113 case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01114 return -1;
01115 case 0:
01116 if (latin1_encoding.type[result] == BT_NONXML)
01117 return -1;
01118 break;
01119 case 0xFF:
01120 if (result == 0xFFFE || result == 0xFFFF)
01121 return -1;
01122 break;
01123 }
01124 return result;
01125 }
01126
01127 int XmlUtf8Encode(int c, char *buf)
01128 {
01129 enum {
01130
01131 min2 = 0x80,
01132 min3 = 0x800,
01133 min4 = 0x10000
01134 };
01135
01136 if (c < 0)
01137 return 0;
01138 if (c < min2) {
01139 buf[0] = (c | UTF8_cval1);
01140 return 1;
01141 }
01142 if (c < min3) {
01143 buf[0] = ((c >> 6) | UTF8_cval2);
01144 buf[1] = ((c & 0x3f) | 0x80);
01145 return 2;
01146 }
01147 if (c < min4) {
01148 buf[0] = ((c >> 12) | UTF8_cval3);
01149 buf[1] = (((c >> 6) & 0x3f) | 0x80);
01150 buf[2] = ((c & 0x3f) | 0x80);
01151 return 3;
01152 }
01153 if (c < 0x110000) {
01154 buf[0] = ((c >> 18) | UTF8_cval4);
01155 buf[1] = (((c >> 12) & 0x3f) | 0x80);
01156 buf[2] = (((c >> 6) & 0x3f) | 0x80);
01157 buf[3] = ((c & 0x3f) | 0x80);
01158 return 4;
01159 }
01160 return 0;
01161 }
01162
/* Encode the code point charNum as UTF-16 into buf.
   Returns 1 when a single unit was written (charNum below 0x10000), 2
   when a surrogate pair was written, and 0 when charNum is negative or
   not below 0x110000 (buf is left untouched in that case). */
int
XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }
  {
    /* Split the 20-bit offset into high and low surrogate. */
    const int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
  }
  return 2;
}
01179
/* Encoding object for a caller-described single/multi-byte encoding.
   `normal` is the first member, and the unknown_* functions in this file
   cast an ENCODING pointer to a struct unknown_encoding pointer to reach
   the fields below. */
01180 struct unknown_encoding {
01181 struct normal_encoding normal;
/* Caller-supplied decoder: invoked with userData and a pointer to the
   bytes of a character, returns an int character value. */
01182 int (*convert)(void *userData, const char *p);
01183 void *userData;
/* UTF-16 value for each byte; an entry of 0 makes unknown_toUtf16 call
   convert() instead. */
01184 unsigned short utf16[256];
/* Per-byte UTF-8 form: element [0] is the byte count and the encoded
   bytes follow; a count of 0 makes unknown_toUtf8 call convert()
   instead. */
01185 char utf8[256][4];
01186 };
01187
01188 int XmlSizeOfUnknownEncoding(void)
01189 {
01190 return sizeof(struct unknown_encoding);
01191 }
01192
01193 static
01194 int unknown_isName(const ENCODING *enc, const char *p)
01195 {
01196 int c = ((const struct unknown_encoding *)enc)
01197 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01198 if (c & ~0xFFFF)
01199 return 0;
01200 return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01201 }
01202
01203 static
01204 int unknown_isNmstrt(const ENCODING *enc, const char *p)
01205 {
01206 int c = ((const struct unknown_encoding *)enc)
01207 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01208 if (c & ~0xFFFF)
01209 return 0;
01210 return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01211 }
01212
01213 static
01214 int unknown_isInvalid(const ENCODING *enc, const char *p)
01215 {
01216 int c = ((const struct unknown_encoding *)enc)
01217 ->convert(((const struct unknown_encoding *)enc)->userData, p);
01218 return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01219 }
01220
01221 static
01222 void unknown_toUtf8(const ENCODING *enc,
01223 const char **fromP, const char *fromLim,
01224 char **toP, const char *toLim)
01225 {
01226 char buf[XML_UTF8_ENCODE_MAX];
01227 for (;;) {
01228 const char *utf8;
01229 int n;
01230 if (*fromP == fromLim)
01231 break;
01232 utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
01233 n = *utf8++;
01234 if (n == 0) {
01235 int c = ((const struct unknown_encoding *)enc)
01236 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01237 n = XmlUtf8Encode(c, buf);
01238 if (n > toLim - *toP)
01239 break;
01240 utf8 = buf;
01241 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01242 - (BT_LEAD2 - 2);
01243 }
01244 else {
01245 if (n > toLim - *toP)
01246 break;
01247 (*fromP)++;
01248 }
01249 do {
01250 *(*toP)++ = *utf8++;
01251 } while (--n != 0);
01252 }
01253 }
01254
01255 static
01256 void unknown_toUtf16(const ENCODING *enc,
01257 const char **fromP, const char *fromLim,
01258 unsigned short **toP, const unsigned short *toLim)
01259 {
01260 while (*fromP != fromLim && *toP != toLim) {
01261 unsigned short c
01262 = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
01263 if (c == 0) {
01264 c = (unsigned short)((const struct unknown_encoding *)enc)
01265 ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
01266 *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
01267 - (BT_LEAD2 - 2);
01268 }
01269 else
01270 (*fromP)++;
01271 *(*toP)++ = c;
01272 }
01273 }
01274
01275 ENCODING *
01276 XmlInitUnknownEncoding(void *mem,
01277 int *table,
01278 int (*convert)(void *userData, const char *p),
01279 void *userData)
01280 {
01281 int i;
01282 struct unknown_encoding *e = mem;
01283 for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
01284 ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01285 for (i = 0; i < 128; i++)
01286 if (latin1_encoding.type[i] != BT_OTHER
01287 && latin1_encoding.type[i] != BT_NONXML
01288 && table[i] != i)
01289 return 0;
01290 for (i = 0; i < 256; i++) {
01291 int c = table[i];
01292 if (c == -1) {
01293 e->normal.type[i] = BT_MALFORM;
01294
01295 e->utf16[i] = 0xFFFF;
01296 e->utf8[i][0] = 1;
01297 e->utf8[i][1] = 0;
01298 }
01299 else if (c < 0) {
01300 if (c < -4)
01301 return 0;
01302 e->normal.type[i] = BT_LEAD2 - (c + 2);
01303 e->utf8[i][0] = 0;
01304 e->utf16[i] = 0;
01305 }
01306 else if (c < 0x80) {
01307 if (latin1_encoding.type[c] != BT_OTHER
01308 && latin1_encoding.type[c] != BT_NONXML
01309 && c != i)
01310 return 0;
01311 e->normal.type[i] = latin1_encoding.type[c];
01312 e->utf8[i][0] = 1;
01313 e->utf8[i][1] = (char)c;
01314 e->utf16[i] = c == 0 ? 0xFFFF : c;
01315 }
01316 else if (checkCharRefNumber(c) < 0) {
01317 e->normal.type[i] = BT_NONXML;
01318
01319 e->utf16[i] = 0xFFFF;
01320 e->utf8[i][0] = 1;
01321 e->utf8[i][1] = 0;
01322 }
01323 else {
01324 if (c > 0xFFFF)
01325 return 0;
01326 if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01327 e->normal.type[i] = BT_NMSTRT;
01328 else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01329 e->normal.type[i] = BT_NAME;
01330 else
01331 e->normal.type[i] = BT_OTHER;
01332 e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01333 e->utf16[i] = c;
01334 }
01335 }
01336 e->userData = userData;
01337 e->convert = convert;
01338 if (convert) {
01339 e->normal.isName2 = unknown_isName;
01340 e->normal.isName3 = unknown_isName;
01341 e->normal.isName4 = unknown_isName;
01342 e->normal.isNmstrt2 = unknown_isNmstrt;
01343 e->normal.isNmstrt3 = unknown_isNmstrt;
01344 e->normal.isNmstrt4 = unknown_isNmstrt;
01345 e->normal.isInvalid2 = unknown_isInvalid;
01346 e->normal.isInvalid3 = unknown_isInvalid;
01347 e->normal.isInvalid4 = unknown_isInvalid;
01348 }
01349 e->normal.enc.utf8Convert = unknown_toUtf8;
01350 e->normal.enc.utf16Convert = unknown_toUtf16;
01351 return &(e->normal.enc);
01352 }
01353
01354
01355
/* Indices of the built-in encodings.  The non-negative values double as
   positions in the encodingNames array of getEncodingIndex() and in the
   encodingTable passed to initScan(), so their order must not change. */
enum {
  UNKNOWN_ENC    = -1,  /* name given but not recognised */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  NO_ENC         = 6    /* no encoding name was supplied */
};
01367
01368 static const char KW_ISO_8859_1[] = {
01369 ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
01370 };
01371 static const char KW_US_ASCII[] = {
01372 ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
01373 };
01374 static const char KW_UTF_8[] = {
01375 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
01376 };
01377 static const char KW_UTF_16[] = {
01378 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
01379 };
01380 static const char KW_UTF_16BE[] = {
01381 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
01382 };
01383 static const char KW_UTF_16LE[] = {
01384 ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
01385 };
01386
01387 static
01388 int getEncodingIndex(const char *name)
01389 {
01390 static const char *encodingNames[] = {
01391 KW_ISO_8859_1,
01392 KW_US_ASCII,
01393 KW_UTF_8,
01394 KW_UTF_16,
01395 KW_UTF_16BE,
01396 KW_UTF_16LE,
01397 };
01398 int i;
01399 if (name == 0)
01400 return NO_ENC;
01401 for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
01402 if (streqci(name, encodingNames[i]))
01403 return i;
01404 return UNKNOWN_ENC;
01405 }
01406
01407
01408
01409
/* The index of the encoding requested at initialisation is kept in the
   isUtf16 member of the INIT_ENCODING's embedded initEnc record.
   INIT_ENC_INDEX reads it back as an int; SET_INIT_ENC_INDEX stores it as
   a char.  The index argument is parenthesised so that the cast applies to
   the whole expression passed in, not just to its first operand. */
#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
01412
01413
01414
01415
01416
01417
01418
01419
01420
01421 static
01422 int initScan(const ENCODING **encodingTable,
01423 const INIT_ENCODING *enc,
01424 int state,
01425 const char *ptr,
01426 const char *end,
01427 const char **nextTokPtr)
01428 {
01429 const ENCODING **encPtr;
01430
01431 if (ptr == end)
01432 return XML_TOK_NONE;
01433 encPtr = enc->encPtr;
01434 if (ptr + 1 == end) {
01435
01436 #ifndef XML_DTD
01437
01438 if (state != XML_CONTENT_STATE)
01439 return XML_TOK_PARTIAL;
01440 #endif
01441
01442
01443 switch (INIT_ENC_INDEX(enc)) {
01444 case UTF_16_ENC:
01445 case UTF_16LE_ENC:
01446 case UTF_16BE_ENC:
01447 return XML_TOK_PARTIAL;
01448 }
01449 switch ((unsigned char)*ptr) {
01450 case 0xFE:
01451 case 0xFF:
01452 case 0xEF:
01453 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01454 && state == XML_CONTENT_STATE)
01455 break;
01456
01457 case 0x00:
01458 case 0x3C:
01459 return XML_TOK_PARTIAL;
01460 }
01461 }
01462 else {
01463 switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01464 case 0xFEFF:
01465 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01466 && state == XML_CONTENT_STATE)
01467 break;
01468 *nextTokPtr = ptr + 2;
01469 *encPtr = encodingTable[UTF_16BE_ENC];
01470 return XML_TOK_BOM;
01471
01472 case 0x3C00:
01473 if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01474 || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01475 && state == XML_CONTENT_STATE)
01476 break;
01477 *encPtr = encodingTable[UTF_16LE_ENC];
01478 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01479 case 0xFFFE:
01480 if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01481 && state == XML_CONTENT_STATE)
01482 break;
01483 *nextTokPtr = ptr + 2;
01484 *encPtr = encodingTable[UTF_16LE_ENC];
01485 return XML_TOK_BOM;
01486 case 0xEFBB:
01487
01488
01489
01490
01491
01492
01493 if (state == XML_CONTENT_STATE) {
01494 int e = INIT_ENC_INDEX(enc);
01495 if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
01496 break;
01497 }
01498 if (ptr + 2 == end)
01499 return XML_TOK_PARTIAL;
01500 if ((unsigned char)ptr[2] == 0xBF) {
01501 *encPtr = encodingTable[UTF_8_ENC];
01502 return XML_TOK_BOM;
01503 }
01504 break;
01505 default:
01506 if (ptr[0] == '\0') {
01507
01508
01509
01510
01511 if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01512 break;
01513 *encPtr = encodingTable[UTF_16BE_ENC];
01514 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01515 }
01516 else if (ptr[1] == '\0') {
01517
01518
01519
01520
01521
01522
01523
01524
01525 if (state == XML_CONTENT_STATE)
01526 break;
01527 *encPtr = encodingTable[UTF_16LE_ENC];
01528 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01529 }
01530 break;
01531 }
01532 }
01533 *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01534 return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01535 }
01536
01537
01538 #define NS(x) x
01539 #define ns(x) x
01540 #include "xmltok_ns.c"
01541 #undef NS
01542 #undef ns
01543
#ifdef XML_NS

/* Second inclusion of xmltok_ns.c: NS() appends "NS" and ns() appends
   "_ns" to the names it is applied to. */
#define NS(name) name ## NS
#define ns(name) name ## _ns

#include "xmltok_ns.c"

#undef NS
#undef ns

/* Namespace-aware counterpart of XmlInitUnknownEncoding: performs the same
   initialisation and, on success, additionally marks the ASCII colon byte
   as BT_COLON.  Returns the encoding, or null when the underlying
   initialisation fails. */
ENCODING *
XmlInitUnknownEncodingNS(void *mem,
                         int *table,
                         int (*convert)(void *userData, const char *p),
                         void *userData)
{
  ENCODING *result = XmlInitUnknownEncoding(mem, table, convert, userData);
  if (!result)
    return result;
  ((struct normal_encoding *)result)->type[ASCII_COLON] = BT_COLON;
  return result;
}

#endif /* XML_NS */