1 mike 1.1 /*
2 **==============================================================================
3 **
4 ** Open Management Infrastructure (OMI)
5 **
6 ** Copyright (c) Microsoft Corporation
7 **
8 ** Licensed under the Apache License, Version 2.0 (the "License"); you may not
9 ** use this file except in compliance with the License. You may obtain a copy
10 ** of the License at
11 **
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 **
14 ** THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 ** KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
16 ** WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
17 ** MERCHANTABLITY OR NON-INFRINGEMENT.
18 **
19 ** See the Apache 2 License for the specific language governing permissions
20 ** and limitations under the License.
21 **
22 mike 1.1 **==============================================================================
23 */
24
25 #include <common.h>
26 #include "xml.h"
27 #include <string.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <stdarg.h>
31 #include <ctype.h>
32 #include <base/io.h>
33
34 #if defined(_MSC_VER)
35 /* PreFast - reviewed and believed to be false-positive*/
36
37 /* warning C6385: Invalid data: accessing '??', the readable size is 'x' bytes, but 'y' bytes might be read: Lines: ... */
38 # pragma warning(disable : 6385)
39 /* warning C6386: Buffer overrun: accessing 'self->registeredNameSpaces' ... */
40 # pragma warning(disable : 6386)
41
42 #endif /* _MSC_VER */
43 mike 1.1
44 /*
45 **==============================================================================
46 **
47 ** Local definitions
48 **
49 **==============================================================================
50 */
51
/* Parser states: what XML_Next() does with the input at self->ptr. */
typedef enum _XML_State
{
    /* Skip whitespace and expect the '<' that opens the next markup item */
    STATE_START = 0,
    /* Positioned just past a '<'; dispatch on the kind of tag that follows */
    STATE_TAG = 1,
    /* Parse character data up to the next '<' */
    STATE_CHARS = 2
}
XML_State;
59
60 /* Space characters include [\n\t\r ]
61 * _spaceChar['\n'] => 1
62 * _spaceChar['\r'] => 2
63 * _spaceChar['\t'] => 2
64 mike 1.1 * _spaceChar[' '] => 2
65 */
66 static unsigned char _spaceChar[256] =
67 {
68 0,0,0,0,0,0,0,0,0,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
69 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
70 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
71 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
72 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
73 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
74 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
75 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
76 };
77
78 INLINE int _IsSpace(char c)
79 {
80 return _spaceChar[(unsigned int)c];
81 }
82
83 /* Matches XML name characters of the form: [A-Za-z_][A-Za-z0-9_-.:]*
84 * _nameChar[A-Za-z_] => 2 (first character)
85 mike 1.1 * _nameChar[A-Za-z0-9_-.:] => 1 or 2 (inner character)
86 */
87 static unsigned char _nameChar[256] =
88 {
89 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
90 0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
91 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,2,
92 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,0,0,0,0,
93 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
94 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
95 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
96 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
97 };
98
99 INLINE int _IsFirst(char c)
100 {
101 return _nameChar[(unsigned int)c] & 2;
102 }
103
104 INLINE int _IsInner(char c)
105 {
106 mike 1.1 return _nameChar[(unsigned char)c];
107 }
108
109 INLINE char* _SkipInner(char* p)
110 {
111 while (_IsInner(*p))
112 p++;
113
114 return p;
115 }
116
117 static char* _SkipSpacesAux(XML* self, char* p)
118 {
119 unsigned char x;
120 size_t n = 0;
121
122 while ((x = (unsigned char)_IsSpace(*p)) != 0)
123 {
124 n += 0x01 & x;
125 p++;
126 }
127 mike 1.1
128 self->line += n;
129 return p;
130 }
131
132 INLINE char* _SkipSpaces(XML* self, char* p)
133 {
134 if (!_IsSpace(p[0]))
135 return p;
136
137 if (p[0] == '\n')
138 self->line++;
139
140 if (!_IsSpace(p[1]))
141 return &p[1];
142
143 if (p[1] == '\n')
144 self->line++;
145
146 if (!_IsSpace(p[2]))
147 return &p[2];
148 mike 1.1
149 if (p[2] == '\n')
150 self->line++;
151
152 return _SkipSpacesAux(self, &p[3]);
153 }
154
155 INLINE char* _ToEntityRef(XML* self, char* p, char* ch)
156 {
157 /* Note: we collected the following statistics on the frequency of
158 * each entity reference in a large body of XML documents:
159 *
160 * " - 74,480 occurences
161 * ' - 13,877 occurences
162 * < - 9,919 occurences
163 * > - 9,853 occurences
164 * & - 111 occurences
165 *
166 * The cases below are organized in order of statistical frequency.
167 */
168
169 mike 1.1 /* Match one of these: "lt;", "gt;", "amp;", "quot;", "apos;" */
170
171 if (p[0] == 'q' && p[1] == 'u' && p[2] == 'o' && p[3] == 't' && p[4] == ';')
172 {
173 *ch = '"';
174 return p + 5;
175 }
176
177 if (p[0] == 'a' && p[1] == 'p' && p[2] == 'o' && p[3] == 's' && p[4] == ';')
178 {
179 *ch = '\'';
180 return p + 5;
181 }
182
183 if (p[0] == 'l' && p[1] == 't' && p[2] == ';')
184 {
185 *ch = '<';
186 return p + 3;
187 }
188
189 if (p[0] == 'g' && p[1] == 't' && p[2] == ';')
190 mike 1.1 {
191 *ch = '>';
192 return p + 3;
193 }
194
195 if (p[0] == 'a' && p[1] == 'm' && p[2] == 'p' && p[3] == ';')
196 {
197 *ch = '&';
198 return p + 4;
199 }
200
201 XML_Raise(self, "bad entity reference");
202 return p;
203 }
204
205 INLINE char* _ToCharRef(XML* self, char* p, char* ch)
206 {
207 char* end;
208 unsigned long x;
209
210 if (*p == 'x')
211 mike 1.1 {
212 p++;
213 x = strtoul(p, &end, 16);
214 }
215 else
216 {
217 x = strtoul(p, &end, 10);
218 }
219
220 if (end == p || *end != ';' || x > 255)
221 {
222 *ch = '\0';
223 XML_Raise(self, "bad character reference");
224 return p;
225 }
226
227 *ch = (char)x;
228
229 return end + 1;
230 }
231
232 mike 1.1 INLINE char* _ToRef(XML* self, char* p, char* ch)
233 {
234 /* Examples:
235 * @
236 * &xFF;
237 * &
238 * <
239 */
240 if (*p == '#')
241 return _ToCharRef(self, p + 1, ch);
242 else
243 return _ToEntityRef(self, p, ch);
244 }
245
246 /* Reduce entity references and remove leading and trailing whitespace */
247 static char* _ReduceAttrValue(XML* self, char** pInOut, char eos)
248 {
249 /* Matches all but '\0', '\'', '"', and '&'. All matching charcters
250 * yeild 2, except for '\n', which yields 1
251 */
252 static unsigned char _match[256] =
253 mike 1.1 {
254 0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
255 1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
256 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
257 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
258 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
259 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
260 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
261 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
262 };
263 char* p = *pInOut;
264 char* end = p;
265 size_t n = 0;
266
267 /* Skip uninteresting characters */
268 for (;;)
269 {
270 while (_match[(unsigned char)*p])
271 p++;
272
273 if (*p != '\n')
274 mike 1.1 break;
275
276 self->line++;
277 p++;
278 }
279
280 end = p;
281
282 while (*p && *p != eos)
283 {
284 if (*p == '&')
285 {
286 char c = '\0';
287 char* tmp;
288
289 p++;
290 tmp = _ToRef(self, p, &c);
291
292 if (self->status)
293 {
294 /* Propagate error */
295 mike 1.1 return NULL;
296 }
297
298 *end++ = c;
299 p = tmp;
300 }
301 else
302 {
303 if (*p == '\n')
304 n++;
305
306 *end++ = *p++;
307 }
308 }
309
310 *pInOut = p;
311 self->line += n;
312
313 return end;
314 }
315
316 mike 1.1 /* Reduce character data, advance p, and return pointer to end */
317 static char* _ReduceCharData(XML* self, char** pInOut)
318 {
319 /* Match all but these: '\0', '<', '&', '\n' */
320 static unsigned char _match[256] =
321 {
322 0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
323 1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,
324 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
325 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
326 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
327 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
328 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
329 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
330 };
331 char* p = *pInOut;
332 char* end = p;
333 size_t n = 0;
334
335 for (;;)
336 {
337 mike 1.1 while ((_match[(unsigned char)*p]))
338 p++;
339
340 if (*p != '\n')
341 break;
342
343 p++;
344 self->line++;
345 }
346
347 end = p;
348
349 /* Can we return now? */
350 if (*p == '<')
351 {
352 *pInOut = p;
353 self->line += n;
354 return end;
355 }
356
357 /* Seek next tag start */
358 mike 1.1 while (*p && *p != '<')
359 {
360 if (*p == '&')
361 {
362 char c = '\0';
363 char* tmp;
364
365 p++;
366 tmp = _ToRef(self, p, &c);
367
368 if (tmp == p)
369 return NULL;
370
371 *end++ = c;
372 p = tmp;
373 }
374 else
375 {
376 for (;;)
377 {
378 while (_match[(unsigned char)*p])
379 mike 1.1 *end++ = *p++;
380
381 if (*p != '\n')
382 break;
383
384 *end++ = *p++;
385 self->line++;
386 }
387 }
388 }
389
390 /* Document cannot end with character data */
391 if (*p == '\0')
392 return NULL;
393
394 *pInOut = p;
395 self->line += n;
396
397 return end;
398 }
399
400 mike 1.1 /* Calculate a fast hash code for a non-zero-length strings */
401 INLINE unsigned int _HashCode(const char* s, size_t n)
402 {
403 /* This hash algorithm excludes the first character since for many strings
404 * (e.g., URIs) the first character is not unique. Instead the hash
405 * comprises three components:
406 * (1) The length
407 * (3) The last chacter
408 */
409 return (int)(n ^ s[n-1]);
410 }
411
412 /* Map a URI to a single character namespace identifier */
413 static char _FindNamespaceID(
414 XML* self,
415 const char* uri,
416 size_t uriSize)
417 {
418 size_t i;
419 unsigned int code = _HashCode(uri, uriSize);
420
421 mike 1.1 /* Resolve from client namespace registrations */
422 for (i = 0; i < self->registeredNameSpacesSize; i++)
423 {
424 XML_RegisteredNameSpace* rns = &self->registeredNameSpaces[i];
425
426 if (rns->uriCode == code && strcmp(rns->uri, uri) == 0)
427 return rns->id;
428 }
429
430 /* Not found so return null id */
431 return '\0';
432 }
433
434 /* Translate the namespace name used in the document to a single-character
435 * namespace name specified by the client in the XML_RegisterNameSpace() call.
436 * For example: "wsman:OptimizeEnumeration" => "w:OptimizeEnumeration".
437 */
438 static char* _TranslateName(XML* self, char* name, char* colon)
439 {
440 unsigned int code;
441 size_t i;
442 mike 1.1
443 /* Temporarily zero-out the ':' character */
444 *colon = '\0';
445
446 /* Calculate hash code */
447 code = _HashCode(name, colon - name);
448
449 /* First check single entry cache */
450 if (self->nameSpacesCacheIndex != (size_t)-1)
451 {
452 XML_NameSpace* ns = &self->nameSpaces[self->nameSpacesCacheIndex];
453
454 if (ns->nameCode == code && strcmp(ns->name, name) == 0)
455 {
456 if (ns->id)
457 {
458 colon[-1] = ns->id;
459 *colon = ':';
460 return colon - 1;
461 }
462 else
463 mike 1.1 {
464 *colon = ':';
465 return name;
466 }
467 }
468 }
469
470 /* Translate name to the one found in the nameSpaces[] array */
471 for (i = self->nameSpacesSize; i--; )
472 {
473 XML_NameSpace* ns = &self->nameSpaces[i];
474
475 if (ns->nameCode == code && strcmp(ns->name, name) == 0)
476 {
477 /* Cache */
478 self->nameSpacesCacheIndex = i;
479
480 if (ns->id)
481 {
482 colon[-1] = ns->id;
483 *colon = ':';
484 mike 1.1 return colon - 1;
485 }
486 else
487 {
488 *colon = ':';
489 return name;
490 }
491 }
492 }
493
494 /* Restore the ':' character */
495 *colon = ':';
496 return name;
497 }
498
499 static void _ParseAttr(
500 XML* self,
501 XML_Elem* elem,
502 char** pInOut)
503 {
504 char* p = *pInOut;
505 mike 1.1 char* name;
506 char* nameEnd;
507 char* value;
508 char* valueEnd;
509 char* colon = NULL;
510
511 /* Parse the attribute name */
512 {
513 name = p;
514
515 if (!_IsFirst(*p))
516 {
517 XML_Raise(self, "expected attribute name");
518 return;
519 }
520
521 p++;
522
523 p = _SkipInner(p);
524
525 if (*p == ':')
526 mike 1.1 {
527 colon = p++;
528 p = _SkipInner(p);
529 }
530
531 nameEnd = p;
532 }
533
534 /* Seek the quote character (position p beyond quote) */
535 {
536 /* Skip spaces */
537 p = _SkipSpaces(self, p);
538
539 /* Expect a '=' character */
540 if (*p++ != '=')
541 {
542 XML_Raise(self, "expected '=' character");
543 return;
544 }
545 }
546
547 mike 1.1 /* Null-terminate name now that we are beyond the '=' */
548 *nameEnd = '\0';
549
550 /* Skip spaces */
551 p = _SkipSpaces(self, p);
552
553 /* Parse the value */
554 {
555 char quote;
556
557 /* Expect opening quote */
558 if (*p != '"' && *p != '\'')
559 {
560 XML_Raise(self, "expected opening quote");
561 return;
562 }
563
564 quote = *p++;
565 value = p;
566
567 valueEnd = _ReduceAttrValue(self, &p, quote);
568 mike 1.1
569 if (self->status)
570 {
571 /* Propagate error */
572 return;
573 }
574
575 /* Expect closing quote */
576 if (*p++ != quote)
577 {
578 XML_Raise(self, "expected closing quote");
579 return;
580 }
581
582 /* Null-terminate the value */
583 *valueEnd = '\0';
584 }
585
586 /* Skip spaces */
587 p = _SkipSpaces(self, p);
588
589 mike 1.1 /* If attribute name is "xmlns", extract namespace */
590 if (name[0] == 'x' &&
591 name[1] == 'm' &&
592 name[2] == 'l' &&
593 name[3] == 'n' &&
594 name[4] == 's')
595 {
596 /* ATTN: implement default namespaces */
597 if (name[5] != ':')
598 {
599 XML_Raise(self, "default namespaces not supported: xmlns");
600 return;
601 }
602
603 /* Add new namespace entry */
604 {
605 /* Check for stack overflow */
606 if (self->nameSpacesSize == XML_MAX_NAMESPACES)
607 {
608 XML_Raise(self, "too many namespaces (>%u)",
609 (int)XML_MAX_NAMESPACES);
610 mike 1.1 return;
611 }
612 {
613 XML_NameSpace* ns = &self->nameSpaces[self->nameSpacesSize++];
614 ns->name = &name[6];
615 ns->nameCode = _HashCode(ns->name, nameEnd - ns->name);
616 ns->id = _FindNamespaceID(self, value, valueEnd - value);
617 ns->uri = value;
618 ns->depth = self->stackSize;
619 }
620 }
621 }
622 else
623 {
624 /* Translate the name (possibly replacing namespace with single char) */
625 if (colon)
626 name = _TranslateName(self, name, colon);
627 }
628
629 /* Append attribute to element */
630 {
631 mike 1.1 XML_Attr* attr;
632
633 /* Check for attribute array overflow */
634 if (elem->attrsSize == XML_MAX_ATTRIBUTES)
635 {
636 XML_Raise(self, "too many attributes (>%u)", XML_MAX_ATTRIBUTES);
637 return;
638 }
639
640 attr = &elem->attrs[elem->attrsSize++];
641 attr->name = name;
642 attr->value = value;
643 }
644
645 *pInOut = p;
646 }
647
648 static void _ParseProcessingInstruction(
649 XML* self,
650 XML_Elem* elem,
651 char* p)
652 mike 1.1 {
653 /* <?xml version="1.0" encoding="UTF-8" standalone="yes"?> */
654 char* start;
655 char* end;
656
657 /* Advance past '?' character */
658 p++;
659
660 /* Get tag identifier */
661 {
662 start = p;
663
664 p = _SkipInner(p);
665
666 if (*p == ':')
667 {
668 p++;
669 p = _SkipInner(p);
670 }
671
672 /* If input exhuasted */
673 mike 1.1 if (*p == '\0')
674 {
675 XML_Raise(self, "premature end of input");
676 return;
677 }
678
679 end = p;
680 }
681
682 /* Skip spaces */
683 p = _SkipSpaces(self, p);
684
685 /* Process attributes */
686
687 while (*p && *p != '?')
688 {
689 _ParseAttr(self, elem, &p);
690
691 if (self->status)
692 {
693 /* Propagate error */
694 mike 1.1 return;
695 }
696 }
697
698 p++;
699
700 /* Skip spaces */
701 p = _SkipSpaces(self, p);
702
703 /* Expect '>' */
704 if (*p++ != '>')
705 {
706 XML_Raise(self, "expected closing angle bracket");
707 return;
708 }
709
710 /* Return element object */
711 elem->type = XML_INSTRUCTION;
712 elem->data = start;
713 elem->size = end - start;
714 *end = '\0';
715 mike 1.1
716 self->ptr = p;
717
718 if (self->foundRoot)
719 self->state = STATE_CHARS;
720 else
721 self->state = STATE_START;
722 }
723
724 static void _ParseStartTag(
725 XML* self,
726 XML_Elem* elem,
727 char* p)
728 {
729 char* name;
730 char* nameEnd;
731 char* colon = NULL;
732
733 /* Found the root */
734 self->foundRoot = 1;
735
736 mike 1.1 /* Get tag identifier */
737 {
738 name = p;
739
740 if (!_IsFirst(*p++))
741 {
742 XML_Raise(self, "expected element name");
743 return;
744 }
745
746 p = _SkipInner(p);
747
748 if (*p == ':')
749 {
750 colon = p++;
751 p = _SkipInner(p);
752 }
753
754 /* If input exhuasted */
755 if (*p == '\0')
756 {
757 mike 1.1 XML_Raise(self, "premature end of input");
758 return;
759 }
760
761 nameEnd = p;
762 }
763
764 /* Skip spaces */
765 p = _SkipSpaces(self, p);
766
767 /* Process attributes */
768 while (*p && *p != '/' && *p != '>')
769 {
770 _ParseAttr(self, elem, &p);
771
772 if (self->status)
773 return;
774 }
775
776 /* Check for empty tag */
777 if (*p == '/')
778 mike 1.1 {
779 p++;
780
781 /* Null-terminate the tag */
782 *nameEnd = '\0';
783
784 /* Translate tag name (possibly replacing namespace with single char */
785 if (colon)
786 name = _TranslateName(self, name, colon);
787
788 /* Create the element */
789 elem->type = XML_START;
790 elem->data = name;
791 elem->size = nameEnd - name;
792
793 /* Inject an empty tag onto element stack */
794 {
795 /* Check for stack overflow */
796 if (self->elemStackSize == XML_MAX_NESTED)
797 {
798 XML_Raise(self, "element stack overflow (>%u)", XML_MAX_NESTED);
799 mike 1.1 return;
800 }
801
802 self->elemStack[self->elemStackSize] = *elem;
803 self->elemStack[self->elemStackSize].type = XML_END;
804 self->elemStackSize++;
805 self->nesting++;
806 }
807
808 /* Skip space */
809 p = _SkipSpaces(self, p);
810
811 /* Expect '>' */
812 if (*p++ != '>')
813 {
814 XML_Raise(self, "expected closing angle bracket");
815 return;
816 }
817
818 self->ptr = p;
819 self->state = STATE_CHARS;
820 mike 1.1 return;
821 }
822
823 /* Expect '>' */
824 if (*p++ != '>')
825 {
826 XML_Raise(self, "expected closing angle bracket");
827 return;
828 }
829
830 /* Zero-terminate the name tag */
831 *nameEnd = '\0';
832
833 /* Translate the namespace prefix */
834 if (colon)
835 name = _TranslateName(self, name, colon);
836
837 /* Push opening tag */
838 {
839 if (self->stackSize == XML_MAX_NESTED)
840 {
841 mike 1.1 XML_Raise(self, "element stack overflow (>%u)", XML_MAX_NESTED);
842 return;
843 }
844
845 self->stack[self->stackSize].data = name;
846 self->stack[self->stackSize].size = nameEnd - name;
847 self->stackSize++;
848 self->nesting++;
849 }
850
851 /* Return element object */
852 elem->type = XML_START;
853 elem->data = name;
854 elem->size = nameEnd - name;
855
856 self->ptr = p;
857
858 if (self->foundRoot)
859 self->state = STATE_CHARS;
860 else
861 self->state = STATE_START;
862 mike 1.1 }
863
864 static void _ParseEndTag(
865 XML* self,
866 XML_Elem* elem,
867 char* p)
868 {
869 /* Closing element: </name> */
870 char* name;
871 char* nameEnd;
872 char* colon = NULL;
873
874 p++;
875
876 /* Skip space */
877 p = _SkipSpaces(self, p);
878
879 name = p;
880
881 /* Skip name */
882 {
883 mike 1.1 if (!_IsFirst(*p++))
884 {
885 XML_Raise(self, "expected element name");
886 return;
887 }
888
889 p = _SkipInner(p);
890
891 if (*p == ':')
892 {
893 colon = p++;
894 p = _SkipInner(p);
895 }
896 }
897
898 /* If input exhuasted */
899 if (*p == '\0')
900 {
901 XML_Raise(self, "premature end of input");
902 return;
903 }
904 mike 1.1
905 nameEnd = p;
906
907 /* Skip spaces */
908 p = _SkipSpaces(self, p);
909
910 /* Expect '>' */
911 if (*p++ != '>')
912 {
913 XML_Raise(self, "expected closing angle bracket");
914 return;
915 }
916
917 /* Null terminate name */
918 *nameEnd = '\0';
919
920 /* Tranlate the namespace part of the name */
921 if (colon)
922 name = _TranslateName(self, name, colon);
923
924 /* Return element object */
925 mike 1.1 elem->type = XML_END;
926 elem->data = name;
927 elem->size = nameEnd - name;
928
929 /* Match opening name */
930 {
931 /* Check for stack underflow */
932 if (self->stackSize-- == 0)
933 {
934 XML_Raise(self, "too many closing tags: %s", name);
935 return;
936 }
937
938 self->nesting--;
939
940 /* Check that closing name matches opening name */
941 {
942 XML_Name* xn = &self->stack[self->stackSize];
943
944 if (memcmp(xn->data, name, xn->size) != 0)
945 {
946 mike 1.1 XML_Raise(self, "open/close tag mismatch: %s/%s",
947 self->stack[self->stackSize].data, name);
948 return;
949 }
950 }
951 }
952
953 /* Remove namespaces that have just gone out of scope */
954 {
955 size_t i;
956 size_t n = 0;
957
958 for (i = self->nameSpacesSize; i--; )
959 {
960 if (self->nameSpaces[i].depth >= self->stackSize)
961 n++;
962 }
963
964 if (n)
965 {
966 self->nameSpacesSize -= n;
967 mike 1.1
968 /* Clear single-entry cache */
969 if (self->nameSpacesCacheIndex >= self->nameSpacesSize)
970 self->nameSpacesCacheIndex = (size_t)-1;
971 }
972 }
973
974 /* Set next state */
975 self->ptr = p;
976 self->state = STATE_CHARS;
977 }
978
979 static void _ParseComment(
980 XML* self,
981 XML_Elem* elem,
982 char* p)
983 {
984 /* Comment: <!-- blah blah blah --> */
985 char* start;
986 char* end;
987
988 mike 1.1 p += 2;
989 start = p;
990
991 while (*p)
992 {
993 if (p[0] == '-' && p[1] == '-')
994 {
995 if (p[2] != '>')
996 {
997 XML_Raise(self, "expected closing comment");
998 return;
999 }
1000
1001 /* Null-terminate this comment */
1002 end = p;
1003 p += 3;
1004
1005 /* Prepare element */
1006 elem->type = XML_COMMENT;
1007 elem->data = start;
1008 elem->size = end - start;
1009 mike 1.1 *end = '\0';
1010
1011 /* Set next state */
1012 self->ptr = p;
1013
1014 if (self->foundRoot)
1015 self->state = STATE_CHARS;
1016 else
1017 self->state = STATE_START;
1018
1019 return;
1020 }
1021 else if (p[0] == '\n')
1022 self->line++;
1023
1024 p++;
1025 }
1026
1027 XML_Raise(self, "malformed comment");
1028 }
1029
1030 mike 1.1 static void _ParseCDATA(
1031 XML* self,
1032 XML_Elem* elem,
1033 char* p)
1034 {
1035 /* <![CDATA[...]]> */
1036 char* start;
1037 char* end;
1038
1039 p += 7;
1040 start = p;
1041
1042 while (*p)
1043 {
1044 if (p[0] == ']' && p[1] == ']' && p[2] == '>')
1045 {
1046 end = p;
1047 p += 3;
1048
1049 /* Prepare element */
1050 elem->type = XML_CHARS;
1051 mike 1.1 elem->data = start;
1052 elem->size = end - start;
1053 *end = '\0';
1054
1055 /* Set next state */
1056 self->ptr = p;
1057 self->state = STATE_CHARS;
1058
1059 return;
1060
1061 }
1062 else if (p[0] == '\n')
1063 self->line++;
1064
1065 p++;
1066 }
1067
1068 XML_Raise(self, "unterminated CDATA section");
1069 return;
1070 }
1071
1072 mike 1.1 static void _ParseDOCTYPE(
1073 XML* self,
1074 XML_Elem* elem,
1075 char* p)
1076 {
1077 MI_UNUSED(elem);
1078
1079 /* Recognize <!DOCTYPE ...> */
1080 p += 7;
1081
1082 while (*p && *p != '>')
1083 {
1084 if (p[0] == '\n')
1085 self->line++;
1086
1087 p++;
1088 }
1089
1090 if (*p++ != '>')
1091 {
1092 XML_Raise(self, "unterminated DOCTYPE element");
1093 mike 1.1 return;
1094 }
1095
1096 /* Set next state */
1097 self->ptr = p;
1098
1099 if (self->foundRoot)
1100 self->state = STATE_CHARS;
1101 else
1102 self->state = STATE_START;
1103 }
1104
1105 static int _ParseCharData(
1106 XML* self,
1107 XML_Elem* elem,
1108 char* p)
1109 {
1110 char* start;
1111 char* end;
1112
1113 /* Skip leading spaces */
1114 mike 1.1 p = _SkipSpaces(self, p);
1115
1116 /* Reject input if it does appear inside tags */
1117 if (self->stackSize == 0)
1118 {
1119 if (*p == '\0')
1120 {
1121 /* Proper end of input so set status to zero */
1122 self->status = 1;
1123 return 0;
1124 }
1125
1126 XML_Raise(self, "markup outside root element");
1127 return 0;
1128 }
1129
1130 /* Remove leading spaces */
1131 p = _SkipSpaces(self, p);
1132
1133 if (*p == '<')
1134 {
1135 mike 1.1 self->ptr = p + 1;
1136 self->state = STATE_TAG;
1137 return 0;
1138 }
1139
1140 /* Save pointer to start of data */
1141 start = p;
1142
1143 /* reduce character data */
1144 end = _ReduceCharData(self, &p);
1145
1146 if (self->status)
1147 {
1148 /* Propagate error */
1149 return 0;
1150 }
1151
1152 /* Process character data */
1153 if (*p != '<')
1154 {
1155 XML_Raise(self, "expected opening angle bracket");
1156 mike 1.1 return 0;
1157 }
1158
1159 /* Remove trailing spaces (the newlines have already been counted) */
1160 {
1161 /* Remove trainling spaces from the character data */
1162 start[-1] = '\0';
1163
1164 while (_IsSpace(end[-1]))
1165 end--;
1166 }
1167
1168 /* Set next state */
1169 self->ptr = p + 1;
1170 self->state = STATE_TAG;
1171
1172 /* Return character data element if non-empty */
1173 if (end == start)
1174 return 0;
1175
1176 /* Prepare element */
1177 mike 1.1 *end = '\0';
1178 elem->type = XML_CHARS;
1179 elem->data = start;
1180 elem->size = end - start;
1181
1182 /* Return 1 to indicate non-empty element */
1183 return 1;
1184 }
1185
1186 /*
1187 **==============================================================================
1188 **
1189 ** Public definitions
1190 **
1191 **==============================================================================
1192 */
1193
1194 const char* XML_Elem_GetAttr(
1195 XML_Elem* self,
1196 const char* name)
1197 {
1198 mike 1.1 size_t i;
1199
1200 for (i = 0; i < self->attrsSize; i++)
1201 {
1202 if (strcmp(name, self->attrs[i].name) == 0)
1203 return self->attrs[i].value;
1204 }
1205
1206 /* Not found! */
1207 return NULL;
1208 }
1209
1210 void XML_Elem_Dump(
1211 const XML_Elem* s)
1212 {
1213 static const char* _typeNames[] =
1214 {
1215 "NONE",
1216 "START",
1217 "END",
1218 "INSTRUCTION",
1219 mike 1.1 "CHARS",
1220 "COMMENT",
1221 };
1222 size_t i;
1223
1224 printf("==== XML_Elem:\n");
1225 printf("type={%s}\n", _typeNames[(int)s->type]);
1226 printf("data={%s}\n", s->data);
1227 printf("size=%u\n", (int)s->size);
1228
1229 if (s->attrsSize)
1230 {
1231 for (i = 0; i < s->attrsSize; i++)
1232 {
1233 const XML_Attr* attr = &s->attrs[i];
1234 printf("%s=%s\n", attr->name, attr->value);
1235 }
1236 }
1237
1238 putchar('\n');
1239 }
1240 mike 1.1
1241 void XML_NameSpace_Dump(
1242 XML_NameSpace* self)
1243 {
1244 printf("==== XML_NameSpace:\n");
1245 printf("name={%s}\n", self->name);
1246 printf("id={%c}\n", (int)self->id);
1247 printf("uri={%s}\n", self->uri);
1248 printf("depth={%u}\n", (int)self->depth);
1249 putchar('\n');
1250 }
1251
1252 void XML_Init(
1253 XML* self)
1254 {
1255 memset(self, 0, sizeof(XML));
1256
1257 self->nameSpacesCacheIndex = (size_t)-1;
1258 }
1259
1260 void XML_SetText(
1261 mike 1.1 XML* self,
1262 char* text)
1263 {
1264 self->text = text;
1265 self->ptr = text;
1266 self->line = 1;
1267 self->state = STATE_START;
1268 }
1269
1270 int XML_Next(
1271 XML* self,
1272 XML_Elem* elem)
1273 {
1274 if (self->elemStackSize)
1275 {
1276 *elem = self->elemStack[--self->elemStackSize];
1277 self->nesting--;
1278 return 0;
1279 }
1280
1281 elem->attrsSize = 0;
1282 mike 1.1
1283 for (;;)
1284 {
1285 switch (self->state)
1286 {
1287 case STATE_START:
1288 {
1289 char* p = self->ptr;
1290
1291 /* Skip spaces */
1292 p = _SkipSpaces(self, p);
1293
1294 /* Expect '<' */
1295 if (*p != '<')
1296 {
1297 XML_Raise(self, "expected opening angle bracket");
1298 return -1;
1299 }
1300
1301 self->ptr = p + 1;
1302 self->state = STATE_TAG;
1303 mike 1.1 break;
1304 }
1305 case STATE_TAG:
1306 {
1307 char* p = self->ptr;
1308
1309 /* Skip spaces */
1310 p = _SkipSpaces(self, p);
1311
1312 /* Expect one of these */
1313 if (*p == '/')
1314 {
1315 _ParseEndTag(self, elem, p);
1316 return self->status;
1317 }
1318 else if (_IsFirst(*p))
1319 {
1320 _ParseStartTag(self, elem, p);
1321 return self->status;
1322 }
1323 else if (*p == '?')
1324 mike 1.1 {
1325 _ParseProcessingInstruction(self, elem, p);
1326 return self->status;
1327 }
1328 else if (*p == '!')
1329 {
1330 p++;
1331
1332 if (p[0] == '-' && p[1] == '-')
1333 {
1334 _ParseComment(self, elem, p);
1335 return self->status;
1336 }
1337 else if (memcmp(p, "[CDATA[", 7) == 0)
1338 {
1339 _ParseCDATA(self, elem, p);
1340 return self->status;
1341 }
1342 else if (memcmp(p, "DOCTYPE", 7) == 0)
1343 {
1344 _ParseDOCTYPE(self, elem, p);
1345 mike 1.1
1346 if (self->status)
1347 return -1;
1348
1349 break;
1350 }
1351 else
1352 {
1353 XML_Raise(self, "expected comment, CDATA, or DOCTYPE");
1354 return -1;
1355 }
1356 }
1357 else
1358 {
1359 XML_Raise(self, "expected element");
1360 return-1;
1361 }
1362 break;
1363 }
1364 case STATE_CHARS:
1365 {
1366 mike 1.1 char* p = self->ptr;
1367
1368 if (_ParseCharData(self, elem, p) == 1)
1369 {
1370 /* Return character data to caller */
1371 return 0;
1372 }
1373
1374 if (self->status)
1375 return self->status;
1376
1377 /* empty character data */
1378 break;
1379 }
1380 default:
1381 {
1382 XML_Raise(self, "unexpected state");
1383 return -1;
1384 }
1385 }
1386 }
1387 mike 1.1
1388 //return 0;
1389 }
1390
1391 int XML_Expect(
1392 XML* self,
1393 XML_Elem* elem,
1394 XML_Type type,
1395 const char* name)
1396 {
1397 if (XML_Next(self, elem) == 0 &&
1398 elem->type == type &&
1399 (!name || strcmp(elem->data, name) == 0))
1400 {
1401 return 0;
1402 }
1403
1404 #if 0
1405 XML_Elem_Dump(elem);
1406 #endif
1407
1408 mike 1.1 if (type == XML_START)
1409 XML_Raise(self, "expected element: <%s>: %s", name, elem->data);
1410 else if (type == XML_END)
1411 XML_Raise(self, "expected element: </%s>: %s", name, elem->data);
1412 else if (type == XML_CHARS)
1413 XML_Raise(self, "expected character data");
1414
1415 return -1;
1416 }
1417
1418 int XML_Skip(
1419 XML* self)
1420 {
1421 XML_Elem tmp;
1422 size_t nesting = self->nesting;
1423
1424 while (self->nesting >= nesting)
1425 {
1426 if (XML_Next(self, &tmp) != 0)
1427 return -1;
1428 }
1429 mike 1.1
1430 return 0;
1431 }
1432
1433 int XML_RegisterNameSpace(
1434 XML* self,
1435 char id,
1436 const char* uri)
1437 {
1438 XML_RegisteredNameSpace rns;
1439 /* ATTN: we do not check for duplicates */
1440
1441 /* Reject out of range ids */
1442 if (id < 'a' || id > 'z')
1443 return -1;
1444
1445 /* Check for overflow of the array */
1446 if (self->registeredNameSpacesSize == XML_MAX_REGISTERED_NAMESPACES)
1447 return -1;
1448
1449 /* Reject zero-length URI's */
1450 mike 1.1 if (uri[0] == '\0')
1451 return -1;
1452
1453 rns.id = id;
1454 rns.uri = uri;
1455 rns.uriCode = _HashCode(uri, strlen(uri));
1456
1457 self->registeredNameSpaces[self->registeredNameSpacesSize++] = rns;
1458
1459 return 0;
1460 }
1461
1462 int XML_PutBack(
1463 XML* self,
1464 const XML_Elem* elem)
1465 {
1466 /* Check for stack overflow */
1467 if (self->elemStackSize == XML_MAX_NESTED)
1468 {
1469 XML_Raise(self, "element stack overflow (>%u)", XML_MAX_NESTED);
1470 return -1;
1471 mike 1.1 }
1472
1473 self->elemStack[self->elemStackSize++] = *elem;
1474 return 0;
1475 }
1476
1477 void XML_Dump(
1478 XML* self)
1479 {
1480 size_t i;
1481
1482 printf("==== XML:\n");
1483 printf("nameSpaces:\n");
1484
1485 for (i = 0; i < self->nameSpacesSize; i++)
1486 {
1487 XML_NameSpace_Dump(&self->nameSpaces[i]);
1488 }
1489
1490 putchar('\n');
1491 }
1492 mike 1.1
1493 void XML_PutError(XML* self)
1494 {
1495 if (self->status == -1)
1496 fprintf(stderr, "line %u: %s\n", (int)self->line, self->message);
1497 }
1498
1499 void XML_Raise(XML* self, const char* format, ...)
1500 {
1501 int n;
1502 va_list ap;
1503 memset(&ap, 0, sizeof(ap));
1504
1505 self->status = -1;
1506 self->message[0] = '\0';
1507
1508 va_start(ap, format);
1509 n = Vsnprintf(self->message, sizeof(self->message), format, ap);
1510 va_end(ap);
1511 }
1512
1513 mike 1.1 void XML_FormatError(XML* self, char* format, size_t size)
1514 {
1515 *format = '\0';
1516
1517 if (self->status == -1)
1518 Snprintf(format, size, "%u: error: %s", (unsigned int)self->line, self->message);
1519 }
|