132 mike 1.13 static EntityReference _references[] =
133 {
134 { "&", 5, '&' },
135 { "<", 4, '<' },
136 { ">", 4, '>' },
137 { """, 6, '"' },
138 { "'", 6, '\'' }
139 };
140
141 static Uint32 _REFERENCES_SIZE = (sizeof(_references) / sizeof(_references[0]));
142
// Remove all redundant whitespace from the given string, in place:
//
//   - leading whitespace is dropped,
//   - trailing whitespace is dropped,
//   - every interior run of two or more whitespace characters is replaced by
//     a single ' ' (an isolated whitespace character is kept unchanged).
//
// The string only ever shrinks, so the rewrite is done in one pass with a
// read cursor and a write cursor over the same buffer.

static void _normalize(char* text)
{
    const char* src = text;
    char* dst = text;

    // Skip leading whitespace. The cast to unsigned char is required:
    // passing a negative char value to isspace() is undefined behavior.

    while (isspace(static_cast<unsigned char>(*src)))
        src++;

    while (*src)
    {
        if (!isspace(static_cast<unsigned char>(*src)))
        {
            *dst++ = *src++;
            continue;
        }

        // Measure the whitespace run starting here:

        const char* runStart = src;

        while (isspace(static_cast<unsigned char>(*src)))
            src++;

        // A run that reaches the end of the string is trailing whitespace
        // and is discarded entirely:

        if (!*src)
            break;

        // Collapse a multi-character run to one space; keep a lone
        // whitespace character exactly as it was:

        *dst++ = (src - runStart > 1) ? ' ' : *runStart;
    }

    *dst = '\0';
}
200
201 ////////////////////////////////////////////////////////////////////////////////
202 //
203 // XmlException
204 //
205 ////////////////////////////////////////////////////////////////////////////////
206
// Human-readable text for each XML error code. _formMessage() looks entries
// up with (code - 1), so the order here must follow the numbering of the
// error codes, starting at 1.
static const char* _xmlMessages[] =
{
    "Bad opening element",
    "Bad closing element",
    "Bad attribute name",
    "Expected equal sign",
    "Bad attribute value",
    "A \"--\" sequence found within comment",
    "Unterminated comment",
    "Unterminated CDATA block",
    "Unterminated DOCTYPE",
    "Too many attributes: parser only handles 10",
    "Malformed reference",
    "Expected a comment or CDATA following \"<!\" sequence",
    "Closing element does not match opening element",
    "One or more tags are still open",
    "More than one root element was encountered",
    "Validation error",
    "Semantic error"
};
227
228 static String _formMessage(Uint32 code, Uint32 line, const String& message)
229 {
230 String result = _xmlMessages[Uint32(code) - 1];
231
232 char buffer[32];
233 sprintf(buffer, "%d", line);
234 result.append(": on line ");
235 result.append(buffer);
236
237 mike 1.13 if (message.size())
238 {
239 result.append(": ");
240 result.append(message);
241 }
242
243 return result;
244 }
245
246 XmlException::XmlException(
247 XmlException::Code code,
248 Uint32 lineNumber,
249 const String& message)
250 : Exception(_formMessage(code, lineNumber, message))
251 {
252
253 }
254
255 ////////////////////////////////////////////////////////////////////////////////
256 //
257 // XmlValidationError
258 mike 1.13 //
259 ////////////////////////////////////////////////////////////////////////////////
260
261 XmlValidationError::XmlValidationError(
262 Uint32 lineNumber,
263 const String& message)
264 : XmlException(XmlException::VALIDATION_ERROR, lineNumber, message)
265 {
266
267 }
268
269 ////////////////////////////////////////////////////////////////////////////////
270 //
271 // XmlSemanticError
272 //
273 ////////////////////////////////////////////////////////////////////////////////
274
275 XmlSemanticError::XmlSemanticError(
276 Uint32 lineNumber,
277 const String& message)
278 : XmlException(XmlException::SEMANTIC_ERROR, lineNumber, message)
279 mike 1.13 {
280
281 }
282
283 ////////////////////////////////////////////////////////////////////////////////
284 //
285 // XmlParser
286 //
287 ////////////////////////////////////////////////////////////////////////////////
288
289 XmlParser::XmlParser(char* text) : _line(1), _text(text), _current(text),
290 _restoreChar('\0'), _foundRoot(false)
291 {
292
293 }
294
295 Boolean XmlParser::next(XmlEntry& entry)
296 {
297 if (!_putBackStack.isEmpty())
298 {
299 entry = _putBackStack.top();
300 mike 1.13 _putBackStack.pop();
301 return true;
302 }
303
304 // If a character was overwritten with a null-terminator the last
305 // time this routine was called, then put back that character. Before
306 // exiting of course, restore the null-terminator.
307
308 char* nullTerminator = 0;
309
310 if (_restoreChar && !*_current)
311 {
312 nullTerminator = _current;
313 *_current = _restoreChar;
314 _restoreChar = '\0';
315 }
316
317 // Skip over any whitespace:
318
319 _skipWhitespace(_current);
320
321 mike 1.13 if (!*_current)
322 {
323 if (nullTerminator)
324 *nullTerminator = '\0';
325
326 if (!_stack.isEmpty())
327 throw XmlException(XmlException::UNCLOSED_TAGS, _line);
328
329 return false;
330 }
331
332 // Either a "<...>" or content begins next:
333
334 if (*_current == '<')
335 {
336 _current++;
337 _getElement(_current, entry);
338
339 if (nullTerminator)
340 *nullTerminator = '\0';
341
342 mike 1.13 if (entry.type == XmlEntry::START_TAG)
343 {
344 if (_stack.isEmpty() && _foundRoot)
345 throw XmlException(XmlException::MULTIPLE_ROOTS, _line);
346
347 _foundRoot = true;
348 _stack.push((char*)entry.text);
349 }
350 else if (entry.type == XmlEntry::END_TAG)
351 {
352 if (_stack.isEmpty())
353 throw XmlException(XmlException::START_END_MISMATCH, _line);
354
355 if (strcmp(_stack.top(), entry.text) != 0)
356 throw XmlException(XmlException::START_END_MISMATCH, _line);
357
358 _stack.pop();
359 }
360
361 return true;
362 }
363 mike 1.13 else
364 {
365 entry.type = XmlEntry::CONTENT;
366 entry.text = _current;
367 _getContent(_current);
368 _restoreChar = *_current;
369 *_current = '\0';
370
371 if (nullTerminator)
372 *nullTerminator = '\0';
373
374 _substituteReferences((char*)entry.text);
375 _normalize((char*)entry.text);
376
377 return true;
378 }
379 }
380
381 void XmlParser::putBack(XmlEntry& entry)
382 {
383 _putBackStack.push(entry);
384 mike 1.13 }
385
386 XmlParser::~XmlParser()
387 {
388 // Nothing to do!
389 }
390
391 void XmlParser::_skipWhitespace(char*& p)
392 {
393 while (*p && isspace(*p))
394 {
395 if (*p == '\n')
396 _line++;
397
398 p++;
399 }
400 }
401
402 Boolean XmlParser::_getElementName(char*& p)
403 {
404 if (!isalpha(*p) && *p != '_')
405 mike 1.13 throw XmlException(XmlException::BAD_START_TAG, _line);
406
407 while (*p &&
408 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
409 p++;
410
411 // The next character must be a space:
412
413 if (isspace(*p))
414 {
415 *p++ = '\0';
416 _skipWhitespace(p);
417 }
418
419 if (*p == '>')
420 {
421 *p++ = '\0';
422 return true;
423 }
424
425 return false;
426 mike 1.13 }
427
428 Boolean XmlParser::_getOpenElementName(char*& p, Boolean& openCloseElement)
429 {
430 openCloseElement = false;
431
432 if (!isalpha(*p) && *p != '_')
433 throw XmlException(XmlException::BAD_START_TAG, _line);
434
435 while (*p &&
436 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
437 p++;
438
439 // The next character must be a space:
440
441 if (isspace(*p))
442 {
443 *p++ = '\0';
444 _skipWhitespace(p);
445 }
446
447 mike 1.13 if (*p == '>')
448 {
449 *p++ = '\0';
450 return true;
451 }
452
453 if (p[0] == '/' && p[1] == '>')
454 {
455 openCloseElement = true;
456 *p = '\0';
457 p += 2;
458 return true;
459 }
460
461 return false;
462 }
463
464 void XmlParser::_getAttributeNameAndEqual(char*& p)
465 {
466 if (!isalpha(*p) && *p != '_')
467 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
468 mike 1.13
469 while (*p &&
470 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
471 p++;
472
473 char* term = p;
474
475 _skipWhitespace(p);
476
477 if (*p != '=')
478 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
479
480 p++;
481
482 _skipWhitespace(p);
483
484 *term = '\0';
485 }
486
487 void XmlParser::_getAttributeValue(char*& p)
488 {
489 mike 1.13 // ATTN-B: handle values contained in semiquotes:
490
491 if (*p != '"' && *p != '\'')
492 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
493
494 char startChar = *p++;
495
496 while (*p && *p != startChar)
497 p++;
498
499 if (*p != startChar)
500 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
501
502 *p++ = '\0';
503 }
504
505 void XmlParser::_getComment(char*& p)
506 {
507 // Now p points to first non-whitespace character beyond "<--" sequence:
508
509 for (; *p; p++)
510 mike 1.13 {
511 if (p[0] == '-' && p[1] == '-')
512 {
513 if (p[2] != '>')
514 {
515 throw XmlException(
516 XmlException::MINUS_MINUS_IN_COMMENT, _line);
517 }
518
519 // Find end of comment (excluding whitespace):
520
521 *p = '\0';
522 p += 3;
523 return;
524 }
525 }
526
527 // If it got this far, then the comment is unterminated:
528
529 throw XmlException(XmlException::UNTERMINATED_COMMENT, _line);
530 }
531 mike 1.13
532 void XmlParser::_getCData(char*& p)
533 {
534 // At this point p points one past "<![CDATA[" sequence:
535
536 for (; *p; p++)
537 {
538 if (p[0] == ']' && p[1] == ']' && p[2] == '>')
539 {
540 *p = '\0';
541 p += 3;
542 return;
543 }
544 else if (*p == '\n')
545 _line++;
546 }
547
548 // If it got this far, then the comment is unterminated:
549
550 throw XmlException(XmlException::UNTERMINATED_CDATA, _line);
551 }
552 mike 1.13
553 void XmlParser::_getDocType(char*& p)
554 {
555 // Just ignore the DOCTYPE command for now:
556
557 for (; *p && *p != '>'; p++)
558 {
559 if (*p == '\n')
560 _line++;
561 }
562
563 if (*p != '>')
564 throw XmlException(XmlException::UNTERMINATED_DOCTYPE, _line);
565
566 p++;
567 }
568
569 void XmlParser::_getContent(char*& p)
570 {
571 while (*p && *p != '<')
572 {
573 mike 1.13 if (*p == '\n')
574 _line++;
575
576 p++;
577 }
578 }
579
580 void XmlParser::_substituteReferences(char* text)
581 {
582 Uint32 rem = strlen(text);
583
584 for (char* p = text; *p; p++, rem--)
585 {
586 if (*p == '&')
587 {
|
590 kumpf 1.18 Uint16 referenceChar = 0;
591 Uint32 referenceLength = 0;
592 XmlException::Code code = XmlException::MALFORMED_REFERENCE;
593
594 if (*(p+1) == '#')
595 {
596 // Found a character (numeric) reference
597 // Determine whether it is decimal or hex
598 if (*(p+2) == 'x')
599 {
600 // Decode a hexadecimal character reference
601 char* q = p+3;
602
603 // At most four digits are allowed, plus trailing ';'
604 Uint32 numDigits;
605 for (numDigits = 0; numDigits < 5; numDigits++, q++)
606 {
607 if (isdigit(*q))
608 {
609 referenceChar = (referenceChar << 4);
610 referenceChar += (*q - '0');
611 kumpf 1.18 }
612 else if ((*q >= 'A') && (*q <= 'F'))
613 {
614 referenceChar = (referenceChar << 4);
615 referenceChar += (*q - 'A' + 10);
616 }
617 else if ((*q >= 'a') && (*q <= 'f'))
618 {
619 referenceChar = (referenceChar << 4);
620 referenceChar += (*q - 'a' + 10);
621 }
622 else if (*q == ';')
623 {
624 break;
625 }
626 else
627 {
628 throw XmlException(code, _line);
629 }
630 }
631
632 kumpf 1.18 // Hex number must be 1 - 4 digits
633 if ((numDigits == 0) || (numDigits > 4))
634 {
635 throw XmlException(code, _line);
636 }
637
638 // ATTN: Currently do not support 16-bit characters
639 if (referenceChar > 0xff)
640 {
641 // ATTN: Is there a good way to say "unsupported"?
642 throw XmlException(code, _line);
643 }
644
645 referenceLength = numDigits + 4;
646 }
647 else
648 {
649 // Decode a decimal character reference
650 Uint32 newChar = 0;
651 char* q = p+2;
652
653 kumpf 1.18 // At most five digits are allowed, plus trailing ';'
654 Uint32 numDigits;
655 for (numDigits = 0; numDigits < 6; numDigits++, q++)
656 {
657 if (isdigit(*q))
658 {
659 newChar = (newChar * 10);
660 newChar += (*q - '0');
661 }
662 else if (*q == ';')
663 {
664 break;
665 }
666 else
667 {
668 throw XmlException(code, _line);
669 }
670 }
671
672 // Decimal number must be 1 - 5 digits and fit in 16 bits
673 if ((numDigits == 0) || (numDigits > 5) ||
674 kumpf 1.18 (newChar > 0xffff))
675 {
676 throw XmlException(code, _line);
677 }
678
679 // ATTN: Currently do not support 16-bit characters
680 if (newChar > 0xff)
681 {
682 // ATTN: Is there a good way to say "unsupported"?
683 throw XmlException(code, _line);
684 }
685
686 referenceChar = Uint16(newChar);
687 referenceLength = numDigits + 3;
688 }
689 }
690 else
691 {
692 // Check for entity reference
693 // ATTN: Inefficient if many entity references are supported
694 Uint32 i;
695 kumpf 1.18 for (i = 0; i < _REFERENCES_SIZE; i++)
696 {
697 Uint32 length = _references[i].length;
698 const char* match = _references[i].match;
699
700 if (strncmp(p, _references[i].match, length) == 0)
701 {
702 referenceChar = _references[i].replacement;
703 referenceLength = length;
704 break;
705 }
706 }
707
708 if (i == _REFERENCES_SIZE)
709 {
710 // Didn't recognize the entity reference
711 // ATTN: Is there a good way to say "unsupported"?
712 throw XmlException(code, _line);
713 }
714 }
715
716 kumpf 1.18 // Replace the reference with the correct character
717 *p = (char)referenceChar;
718 char* q = p + referenceLength;
719 rem = rem - referenceLength + 1;
720 memmove(p + 1, q, rem);
|
721 mike 1.13 }
722 }
723 }
724
725 static const char _EMPTY_STRING[] = "";
726
727 void XmlParser::_getElement(char*& p, XmlEntry& entry)
728 {
729 entry.attributeCount = 0;
730
731 //--------------------------------------------------------------------------
732 // Get the element name (expect one of these: '?', '!', [A-Za-z_])
733 //--------------------------------------------------------------------------
734
735 if (*p == '?')
736 {
737 entry.type = XmlEntry::XML_DECLARATION;
738 entry.text = ++p;
739
740 Boolean openCloseElement = false;
741
742 mike 1.13 if (_getElementName(p))
743 return;
744 }
745 else if (*p == '!')
746 {
747 p++;
748
749 // Expect a comment or CDATA:
750
751 if (p[0] == '-' && p[1] == '-')
752 {
753 p += 2;
754 entry.type = XmlEntry::COMMENT;
755 entry.text = p;
756 _getComment(p);
757 return;
758 }
759 else if (memcmp(p, "[CDATA[", 7) == 0)
760 {
761 p += 7;
762 entry.type = XmlEntry::CDATA;
763 mike 1.13 entry.text = p;
764 _getCData(p);
765 return;
766 }
767 else if (memcmp(p, "DOCTYPE", 7) == 0)
768 {
769 entry.type = XmlEntry::DOCTYPE;
770 entry.text = _EMPTY_STRING;
771 _getDocType(p);
772 return;
773 }
774 throw(XmlException(XmlException::EXPECTED_COMMENT_OR_CDATA, _line));
775 }
776 else if (*p == '/')
777 {
778 entry.type = XmlEntry::END_TAG;
779 entry.text = ++p;
780
781 if (!_getElementName(p))
782 throw(XmlException(XmlException::BAD_END_TAG, _line));
783
784 mike 1.13 return;
785 }
786 else if (isalpha(*p) || *p == '_')
787 {
788 entry.type = XmlEntry::START_TAG;
789 entry.text = p;
790
791 Boolean openCloseElement = false;
792
793 if (_getOpenElementName(p, openCloseElement))
794 {
795 if (openCloseElement)
796 entry.type = XmlEntry::EMPTY_TAG;
797 return;
798 }
799 }
800 else
801 throw XmlException(XmlException::BAD_START_TAG, _line);
802
803 //--------------------------------------------------------------------------
804 // Grab all the attributes:
805 mike 1.13 //--------------------------------------------------------------------------
806
807 for (;;)
808 {
809 if (entry.type == XmlEntry::XML_DECLARATION)
810 {
811 if (p[0] == '?' && p[1] == '>')
812 {
813 p += 2;
814 return;
815 }
816 }
817 else if (entry.type == XmlEntry::START_TAG && p[0] == '/' && p[1] =='>')
818 {
819 entry.type = XmlEntry::EMPTY_TAG;
820 p += 2;
821 return;
822 }
823 else if (*p == '>')
824 {
825 p++;
826 mike 1.13 return;
827 }
828
829 XmlAttribute attr;
830 attr.name = p;
831 _getAttributeNameAndEqual(p);
832
833 if (*p != '"' && *p != '\'')
834 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
835
836 attr.value = p + 1;
837 _getAttributeValue(p);
838
839 if (entry.type == XmlEntry::XML_DECLARATION)
840 {
841 // The next thing must a space or a "?>":
842
843 if (!(p[0] == '?' && p[1] == '>') && !isspace(*p))
844 {
845 throw XmlException(
846 XmlException::BAD_ATTRIBUTE_VALUE, _line);
847 mike 1.13 }
848 }
849 else if (!(*p == '>' || (p[0] == '/' && p[1] == '>') || isspace(*p)))
850 {
851 // The next thing must be a space or a '>':
852
853 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
854 }
855
856 _skipWhitespace(p);
857
858 if (entry.attributeCount == XmlEntry::MAX_ATTRIBUTES)
859 throw XmlException(XmlException::TOO_MANY_ATTRIBUTES, _line);
860
861 _substituteReferences((char*)attr.value);
862 entry.attributes[entry.attributeCount++] = attr;
863 }
864 }
865
// Printable names of the entry types; XmlEntry::print() indexes this table
// directly with the entry's type value.
static const char* _typeStrings[] =
{
    "XML_DECLARATION",
    "START_TAG",
    "EMPTY_TAG",
    "END_TAG",
    "COMMENT",
    "CDATA",
    "DOCTYPE",
    "CONTENT"
};
877
878 void XmlEntry::print() const
879 {
880 PEGASUS_STD(cout) << "=== " << _typeStrings[type] << " ";
881
882 Boolean needQuotes = type == XmlEntry::CDATA || type == XmlEntry::CONTENT;
883
884 if (needQuotes)
885 PEGASUS_STD(cout) << "\"";
886
887 _printValue(text);
888
889 mike 1.13 if (needQuotes)
890 PEGASUS_STD(cout) << "\"";
891
892 PEGASUS_STD(cout) << '\n';
893
894 for (Uint32 i = 0; i < attributeCount; i++)
895 {
896 PEGASUS_STD(cout) << " " << attributes[i].name << "=\"";
897 _printValue(attributes[i].value);
898 PEGASUS_STD(cout) << "\"" << PEGASUS_STD(endl);
899 }
900 }
901
902 const XmlAttribute* XmlEntry::findAttribute(
903 const char* name) const
904 {
905 for (Uint32 i = 0; i < attributeCount; i++)
906 {
907 if (strcmp(attributes[i].name, name) == 0)
908 return &attributes[i];
909 }
910 mike 1.13
911 return 0;
912 }
913
// Find first non-whitespace character (set first) and last non-whitespace
// character (set last one past this). For example, consider this string:
//
//      "   87     "
//
// The first pointer would point to '8' and the last pointer would point one
// beyond '7'. For an empty or all-whitespace string both pointers are set to
// the terminating null.

static void _findEnds(
    const char* str,
    const char*& first,
    const char*& last)
{
    first = str;

    // isspace() requires a value representable as unsigned char; passing a
    // negative char is undefined behavior, hence the casts.

    while (isspace(static_cast<unsigned char>(*first)))
        first++;

    if (!*first)
    {
        last = first;
        return;
    }

    last = first + strlen(first);

    while (last != first && isspace(static_cast<unsigned char>(last[-1])))
        last--;
}
943
944 Boolean XmlEntry::getAttributeValue(
945 const char* name,
946 Uint32& value) const
947 {
948 const XmlAttribute* attr = findAttribute(name);
949
950 if (!attr)
951 return false;
952 mike 1.13
953 const char* first;
954 const char* last;
955 _findEnds(attr->value, first, last);
956
957 char* end = 0;
958 long tmp = strtol(first, &end, 10);
959
960 if (!end || end != last)
961 return false;
962
963 value = Uint32(tmp);
964 return true;
965 }
966
967 Boolean XmlEntry::getAttributeValue(
968 const char* name,
969 Real32& value) const
970 {
971 const XmlAttribute* attr = findAttribute(name);
972
973 mike 1.13 if (!attr)
974 return false;
975
976 const char* first;
977 const char* last;
978 _findEnds(attr->value, first, last);
979
980 char* end = 0;
981 double tmp = strtod(first, &end);
982
983 if (!end || end != last)
984 return false;
985
986 value = Uint32(tmp);
987 return true;
988 }
989
990 Boolean XmlEntry::getAttributeValue(
991 const char* name,
992 const char*& value) const
993 {
994 mike 1.13 const XmlAttribute* attr = findAttribute(name);
995
996 if (!attr)
997 return false;
998
999 value = attr->value;
1000 return true;
1001 }
1002
1003 Boolean XmlEntry::getAttributeValue(const char* name, String& value) const
1004 {
1005 const char* tmp;
1006
1007 if (!getAttributeValue(name, tmp))
1008 return false;
1009
1010 value = tmp;
1011 return true;
1012 }
1013
1014 void XmlAppendCString(Array<Sint8>& out, const char* str)
1015 mike 1.13 {
1016 out.append(str, strlen(str));
1017 }
1018
1019 PEGASUS_NAMESPACE_END
|