1 mike 1.13 //%/////////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2000, 2001 The Open group, BMC Software, Tivoli Systems, IBM
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to
7 // deal in the Software without restriction, including without limitation the
8 // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9 // sell copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // THE ABOVE COPYRIGHT NOTICE AND THIS PERMISSION NOTICE SHALL BE INCLUDED IN
13 // ALL COPIES OR SUBSTANTIAL PORTIONS OF THE SOFTWARE. THE SOFTWARE IS PROVIDED
14 // "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
15 // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
16 // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
17 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
18 // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
19 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20 //
21 //==============================================================================
22 mike 1.13 //
23 // Author: Mike Brasher (mbrasher@bmc.com)
24 //
25 // Modified By:
26 //
27 //%/////////////////////////////////////////////////////////////////////////////
28
29 ////////////////////////////////////////////////////////////////////////////////
30 //
31 // XmlParser
32 //
// This file contains a simple non-validating XML parser. Here are
// several rules for well-formed XML:
35 //
36 // 1. Documents must begin with an XML declaration:
37 //
38 // <?xml version="1.0" standalone="yes"?>
39 //
40 // 2. Comments have the form:
41 //
42 // <!-- blah blah blah -->
43 mike 1.13 //
44 // 3. The following entity references are supported:
45 //
//     &amp;  - ampersand
//     &lt;   - less-than
//     &gt;   - greater-than
//     &quot; - full quote
//     &apos; - apostrophe
51 //
52 // 4. Element names and attribute names take the following form:
53 //
//     [A-Za-z_][A-Za-z_0-9.:-]*
55 //
56 // 5. Arbitrary data (CDATA) can be enclosed like this:
57 //
58 // <![CDATA[
59 // ...
60 // ]]>
61 //
62 // 6. Element names and attributes names are case-sensitive.
63 //
// 7. XmlAttribute values must be delimited by full (double) or half
//    (single) quotes. Undelimited values are rejected.
//
// 8. <!DOCTYPE...> declarations are recognized, but their contents are ignored.
68 //
69 // TODO:
70 //
|
#include <cassert>
#include <cctype>
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "XmlParser.h"
#include "Logger.h"
92
93 PEGASUS_NAMESPACE_BEGIN
94
95 #define PEGASUS_ARRAY_T XmlEntry
96 # include "ArrayImpl.h"
97 #undef PEGASUS_ARRAY_T
98
99
100 ////////////////////////////////////////////////////////////////////////////////
101 //
102 // Static helper functions
103 //
104 ////////////////////////////////////////////////////////////////////////////////
105
106 mike 1.13 static void _printValue(const char* p)
107 {
108 for (; *p; p++)
109 {
110 if (*p == '\n')
111 PEGASUS_STD(cout) << "\\n";
112 else if (*p == '\r')
113 PEGASUS_STD(cout) << "\\r";
114 else if (*p == '\t')
115 PEGASUS_STD(cout) << "\\t";
116 else
117 PEGASUS_STD(cout) << *p;
118 }
119 }
120
121 struct EntityReference
122 {
123 const char* match;
124 Uint32 length;
125 char replacement;
126 };
127 mike 1.13
128 static EntityReference _references[] =
129 {
130 { "&", 5, '&' },
131 { "<", 4, '<' },
132 { ">", 4, '>' },
133 { """, 6, '"' },
134 { "'", 6, '\'' }
135 };
136
137 static Uint32 _REFERENCES_SIZE = (sizeof(_references) / sizeof(_references[0]));
138
// Remove all redundant whitespace from the given string, in place:
//
//   - leading and trailing whitespace is dropped;
//   - an interior run of two or more whitespace characters becomes a single
//     ' ' character;
//   - a lone interior whitespace character is kept exactly as it is (a
//     single tab stays a tab).
//
// The result is never longer than the input, so the rewrite is done with a
// read cursor and a write cursor over the same buffer in one pass.

static void _normalize(char* text)
{
    const char* src = text;
    char* dst = text;

    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF), so every character is converted before the
    // call; plain char may be negative for bytes >= 0x80.

    // Skip leading whitespace:

    while (isspace(static_cast<unsigned char>(*src)))
        src++;

    while (*src)
    {
        if (!isspace(static_cast<unsigned char>(*src)))
        {
            *dst++ = *src++;
            continue;
        }

        // Measure the whitespace run starting here:

        const char* runStart = src;

        while (isspace(static_cast<unsigned char>(*src)))
            src++;

        // A run that reaches the end of the string is trailing whitespace
        // and is discarded:

        if (!*src)
            break;

        // Collapse a multi-character run to one blank; keep a single
        // whitespace character unchanged.

        *dst++ = (src - runStart > 1) ? ' ' : *runStart;
    }

    *dst = '\0';
}
196
197 ////////////////////////////////////////////////////////////////////////////////
198 //
199 // XmlException
200 //
201 ////////////////////////////////////////////////////////////////////////////////
202
// Human-readable text for each XML error. The table is indexed with
// (error code - 1), so the order of the entries is significant.
static const char* _xmlMessages[] =
{
    "Bad opening element",
    "Bad closing element",
    "Bad attribute name",
    "Expected equal sign",
    "Bad attribute value",
    "A \"--\" sequence found within comment",
    "Unterminated comment",
    "Unterminated CDATA block",
    "Unterminated DOCTYPE",
    "Too many attributes: parser only handles 10",
    "Malformed reference",
    "Expected a comment or CDATA following \"<!\" sequence",
    "Closing element does not match opening element",
    "One or more tags are still open",
    "More than one root element was encountered",
    "Validation error",
    "Semantic error"
};
223
224 static String _formMessage(Uint32 code, Uint32 line, const String& message)
225 {
226 String result = _xmlMessages[Uint32(code) - 1];
227
228 char buffer[32];
229 sprintf(buffer, "%d", line);
230 result.append(": on line ");
231 result.append(buffer);
232 mike 1.13
233 if (message.size())
234 {
235 result.append(": ");
236 result.append(message);
237 }
238
239 return result;
240 }
241
242 XmlException::XmlException(
243 XmlException::Code code,
244 Uint32 lineNumber,
245 const String& message)
246 : Exception(_formMessage(code, lineNumber, message))
247 {
248
249 }
250
251 ////////////////////////////////////////////////////////////////////////////////
252 //
253 mike 1.13 // XmlValidationError
254 //
255 ////////////////////////////////////////////////////////////////////////////////
256
257 XmlValidationError::XmlValidationError(
258 Uint32 lineNumber,
259 const String& message)
260 : XmlException(XmlException::VALIDATION_ERROR, lineNumber, message)
261 {
262
263 }
264
265 ////////////////////////////////////////////////////////////////////////////////
266 //
267 // XmlSemanticError
268 //
269 ////////////////////////////////////////////////////////////////////////////////
270
271 XmlSemanticError::XmlSemanticError(
272 Uint32 lineNumber,
273 const String& message)
274 mike 1.13 : XmlException(XmlException::SEMANTIC_ERROR, lineNumber, message)
275 {
276
277 }
278
279 ////////////////////////////////////////////////////////////////////////////////
280 //
281 // XmlParser
282 //
283 ////////////////////////////////////////////////////////////////////////////////
284
285 XmlParser::XmlParser(char* text) : _line(1), _text(text), _current(text),
286 _restoreChar('\0'), _foundRoot(false)
287 {
288
289 }
290
291 Boolean XmlParser::next(XmlEntry& entry)
292 {
293 if (!_putBackStack.isEmpty())
294 {
295 mike 1.13 entry = _putBackStack.top();
296 _putBackStack.pop();
297 return true;
298 }
299
300 // If a character was overwritten with a null-terminator the last
301 // time this routine was called, then put back that character. Before
302 // exiting of course, restore the null-terminator.
303
304 char* nullTerminator = 0;
305
306 if (_restoreChar && !*_current)
307 {
308 nullTerminator = _current;
309 *_current = _restoreChar;
310 _restoreChar = '\0';
311 }
312
313 // Skip over any whitespace:
314
315 _skipWhitespace(_current);
316 mike 1.13
317 if (!*_current)
318 {
319 if (nullTerminator)
320 *nullTerminator = '\0';
321
322 if (!_stack.isEmpty())
323 throw XmlException(XmlException::UNCLOSED_TAGS, _line);
324
325 return false;
326 }
327
328 // Either a "<...>" or content begins next:
329
330 if (*_current == '<')
331 {
332 _current++;
333 _getElement(_current, entry);
334
335 if (nullTerminator)
336 *nullTerminator = '\0';
337 mike 1.13
338 if (entry.type == XmlEntry::START_TAG)
339 {
340 if (_stack.isEmpty() && _foundRoot)
341 throw XmlException(XmlException::MULTIPLE_ROOTS, _line);
342
343 _foundRoot = true;
344 _stack.push((char*)entry.text);
345 }
346 else if (entry.type == XmlEntry::END_TAG)
347 {
348 if (_stack.isEmpty())
349 throw XmlException(XmlException::START_END_MISMATCH, _line);
350
351 if (strcmp(_stack.top(), entry.text) != 0)
352 throw XmlException(XmlException::START_END_MISMATCH, _line);
353
354 _stack.pop();
355 }
356
357 return true;
358 mike 1.13 }
359 else
360 {
361 entry.type = XmlEntry::CONTENT;
362 entry.text = _current;
363 _getContent(_current);
364 _restoreChar = *_current;
365 *_current = '\0';
366
367 if (nullTerminator)
368 *nullTerminator = '\0';
369
370 _substituteReferences((char*)entry.text);
371 _normalize((char*)entry.text);
372
373 return true;
374 }
375 }
376
377 void XmlParser::putBack(XmlEntry& entry)
378 {
379 mike 1.13 _putBackStack.push(entry);
380 }
381
382 XmlParser::~XmlParser()
383 {
384 // Nothing to do!
385 }
386
387 void XmlParser::_skipWhitespace(char*& p)
388 {
389 while (*p && isspace(*p))
390 {
391 if (*p == '\n')
392 _line++;
393
394 p++;
395 }
396 }
397
398 Boolean XmlParser::_getElementName(char*& p)
399 {
400 mike 1.13 if (!isalpha(*p) && *p != '_')
401 throw XmlException(XmlException::BAD_START_TAG, _line);
402
403 while (*p &&
404 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
405 p++;
406
407 // The next character must be a space:
408
409 if (isspace(*p))
410 {
411 *p++ = '\0';
412 _skipWhitespace(p);
413 }
414
415 if (*p == '>')
416 {
417 *p++ = '\0';
418 return true;
419 }
420
421 mike 1.13 return false;
422 }
423
424 Boolean XmlParser::_getOpenElementName(char*& p, Boolean& openCloseElement)
425 {
426 openCloseElement = false;
427
428 if (!isalpha(*p) && *p != '_')
429 throw XmlException(XmlException::BAD_START_TAG, _line);
430
431 while (*p &&
432 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
433 p++;
434
435 // The next character must be a space:
436
437 if (isspace(*p))
438 {
439 *p++ = '\0';
440 _skipWhitespace(p);
441 }
442 mike 1.13
443 if (*p == '>')
444 {
445 *p++ = '\0';
446 return true;
447 }
448
449 if (p[0] == '/' && p[1] == '>')
450 {
451 openCloseElement = true;
452 *p = '\0';
453 p += 2;
454 return true;
455 }
456
457 return false;
458 }
459
460 void XmlParser::_getAttributeNameAndEqual(char*& p)
461 {
462 if (!isalpha(*p) && *p != '_')
463 mike 1.13 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
464
465 while (*p &&
466 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
467 p++;
468
469 char* term = p;
470
471 _skipWhitespace(p);
472
473 if (*p != '=')
474 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
475
476 p++;
477
478 _skipWhitespace(p);
479
480 *term = '\0';
481 }
482
483 void XmlParser::_getAttributeValue(char*& p)
484 mike 1.13 {
485 // ATTN-B: handle values contained in semiquotes:
486
487 if (*p != '"' && *p != '\'')
488 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
489
490 char startChar = *p++;
491
492 while (*p && *p != startChar)
493 p++;
494
495 if (*p != startChar)
496 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
497
498 *p++ = '\0';
499 }
500
501 void XmlParser::_getComment(char*& p)
502 {
503 // Now p points to first non-whitespace character beyond "<--" sequence:
504
505 mike 1.13 for (; *p; p++)
506 {
507 if (p[0] == '-' && p[1] == '-')
508 {
509 if (p[2] != '>')
510 {
511 throw XmlException(
512 XmlException::MINUS_MINUS_IN_COMMENT, _line);
513 }
514
515 // Find end of comment (excluding whitespace):
516
517 *p = '\0';
518 p += 3;
519 return;
520 }
521 }
522
523 // If it got this far, then the comment is unterminated:
524
525 throw XmlException(XmlException::UNTERMINATED_COMMENT, _line);
526 mike 1.13 }
527
528 void XmlParser::_getCData(char*& p)
529 {
530 // At this point p points one past "<![CDATA[" sequence:
531
532 for (; *p; p++)
533 {
534 if (p[0] == ']' && p[1] == ']' && p[2] == '>')
535 {
536 *p = '\0';
537 p += 3;
538 return;
539 }
540 else if (*p == '\n')
541 _line++;
542 }
543
544 // If it got this far, then the comment is unterminated:
545
546 throw XmlException(XmlException::UNTERMINATED_CDATA, _line);
547 mike 1.13 }
548
549 void XmlParser::_getDocType(char*& p)
550 {
551 // Just ignore the DOCTYPE command for now:
552
553 for (; *p && *p != '>'; p++)
554 {
555 if (*p == '\n')
556 _line++;
557 }
558
559 if (*p != '>')
560 throw XmlException(XmlException::UNTERMINATED_DOCTYPE, _line);
561
562 p++;
563 }
564
565 void XmlParser::_getContent(char*& p)
566 {
567 while (*p && *p != '<')
568 mike 1.13 {
569 if (*p == '\n')
570 _line++;
571
572 p++;
573 }
574 }
575
576 void XmlParser::_substituteReferences(char* text)
577 {
578 Uint32 rem = strlen(text);
579
580 for (char* p = text; *p; p++, rem--)
581 {
582 if (*p == '&')
583 {
584 // Look for predefined entity reference:
585
586 Boolean found = false;
587
588 for (Uint32 i = 0; i < _REFERENCES_SIZE; i++)
589 mike 1.13 {
590 Uint32 length = _references[i].length;
591 const char* match = _references[i].match;
592
593 if (strncmp(p, _references[i].match, length) == 0)
594 {
595 found = true;
596 *p = _references[i].replacement;
597 char* q = p + length;
598 rem = rem - length + 1;
599 memmove(p + 1, q, rem);
600 }
601 }
602
603 // If not found, then at least make sure it is well formed:
604
605 if (!found)
606 {
607 char* start = p;
608 p++;
609
610 mike 1.13 XmlException::Code code = XmlException::MALFORMED_REFERENCE;
611
612 if (isalpha(*p) || *p == '_')
613 {
614 for (p++; *p && *p != ';'; p++)
615 {
616 if (!isalnum(*p) && *p != '_')
617 throw XmlException(code, _line);
618 }
619 }
620 else if (*p == '#')
621 {
622 for (p++ ; *p && *p != ';'; p++)
623 {
624 if (!isdigit(*p))
625 throw XmlException(code, _line);
626 }
627 }
628
629 if (*p != ';')
630 throw XmlException(code, _line);
631 mike 1.13
632 rem -= p - start;
633 }
634 }
635 }
636 }
637
638 static const char _EMPTY_STRING[] = "";
639
640 void XmlParser::_getElement(char*& p, XmlEntry& entry)
641 {
642 entry.attributeCount = 0;
643
644 //--------------------------------------------------------------------------
645 // Get the element name (expect one of these: '?', '!', [A-Za-z_])
646 //--------------------------------------------------------------------------
647
648 if (*p == '?')
649 {
650 entry.type = XmlEntry::XML_DECLARATION;
651 entry.text = ++p;
652 mike 1.13
653 Boolean openCloseElement = false;
654
655 if (_getElementName(p))
656 return;
657 }
658 else if (*p == '!')
659 {
660 p++;
661
662 // Expect a comment or CDATA:
663
664 if (p[0] == '-' && p[1] == '-')
665 {
666 p += 2;
667 entry.type = XmlEntry::COMMENT;
668 entry.text = p;
669 _getComment(p);
670 return;
671 }
672 else if (memcmp(p, "[CDATA[", 7) == 0)
673 mike 1.13 {
674 p += 7;
675 entry.type = XmlEntry::CDATA;
676 entry.text = p;
677 _getCData(p);
678 return;
679 }
680 else if (memcmp(p, "DOCTYPE", 7) == 0)
681 {
682 entry.type = XmlEntry::DOCTYPE;
683 entry.text = _EMPTY_STRING;
684 _getDocType(p);
685 return;
686 }
687 throw(XmlException(XmlException::EXPECTED_COMMENT_OR_CDATA, _line));
688 }
689 else if (*p == '/')
690 {
691 entry.type = XmlEntry::END_TAG;
692 entry.text = ++p;
693
694 mike 1.13 if (!_getElementName(p))
695 throw(XmlException(XmlException::BAD_END_TAG, _line));
696
697 return;
698 }
699 else if (isalpha(*p) || *p == '_')
700 {
701 entry.type = XmlEntry::START_TAG;
702 entry.text = p;
703
704 Boolean openCloseElement = false;
705
706 if (_getOpenElementName(p, openCloseElement))
707 {
708 if (openCloseElement)
709 entry.type = XmlEntry::EMPTY_TAG;
710 return;
711 }
712 }
713 else
714 throw XmlException(XmlException::BAD_START_TAG, _line);
715 mike 1.13
716 //--------------------------------------------------------------------------
717 // Grab all the attributes:
718 //--------------------------------------------------------------------------
719
720 for (;;)
721 {
722 if (entry.type == XmlEntry::XML_DECLARATION)
723 {
724 if (p[0] == '?' && p[1] == '>')
725 {
726 p += 2;
727 return;
728 }
729 }
730 else if (entry.type == XmlEntry::START_TAG && p[0] == '/' && p[1] =='>')
731 {
732 entry.type = XmlEntry::EMPTY_TAG;
733 p += 2;
734 return;
735 }
736 mike 1.13 else if (*p == '>')
737 {
738 p++;
739 return;
740 }
741
742 XmlAttribute attr;
743 attr.name = p;
744 _getAttributeNameAndEqual(p);
745
746 if (*p != '"' && *p != '\'')
747 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
748
749 attr.value = p + 1;
750 _getAttributeValue(p);
751
752 if (entry.type == XmlEntry::XML_DECLARATION)
753 {
754 // The next thing must a space or a "?>":
755
756 if (!(p[0] == '?' && p[1] == '>') && !isspace(*p))
757 mike 1.13 {
758 throw XmlException(
759 XmlException::BAD_ATTRIBUTE_VALUE, _line);
760 }
761 }
762 else if (!(*p == '>' || (p[0] == '/' && p[1] == '>') || isspace(*p)))
763 {
764 // The next thing must be a space or a '>':
765
766 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
767 }
768
769 _skipWhitespace(p);
770
771 if (entry.attributeCount == XmlEntry::MAX_ATTRIBUTES)
772 throw XmlException(XmlException::TOO_MANY_ATTRIBUTES, _line);
773
774 _substituteReferences((char*)attr.value);
775 entry.attributes[entry.attributeCount++] = attr;
776 }
777 }
778 mike 1.13
779 static const char* _typeStrings[] =
780 {
781 "XML_DECLARATION",
782 "START_TAG",
783 "EMPTY_TAG",
784 "END_TAG",
785 "COMMENT",
786 "CDATA",
787 "DOCTYPE",
788 "CONTENT"
789 };
790
791 void XmlEntry::print() const
792 {
793 PEGASUS_STD(cout) << "=== " << _typeStrings[type] << " ";
794
795 Boolean needQuotes = type == XmlEntry::CDATA || type == XmlEntry::CONTENT;
796
797 if (needQuotes)
798 PEGASUS_STD(cout) << "\"";
799 mike 1.13
800 _printValue(text);
801
802 if (needQuotes)
803 PEGASUS_STD(cout) << "\"";
804
805 PEGASUS_STD(cout) << '\n';
806
807 for (Uint32 i = 0; i < attributeCount; i++)
808 {
809 PEGASUS_STD(cout) << " " << attributes[i].name << "=\"";
810 _printValue(attributes[i].value);
811 PEGASUS_STD(cout) << "\"" << PEGASUS_STD(endl);
812 }
813 }
814
815 const XmlAttribute* XmlEntry::findAttribute(
816 const char* name) const
817 {
818 for (Uint32 i = 0; i < attributeCount; i++)
819 {
820 mike 1.13 if (strcmp(attributes[i].name, name) == 0)
821 return &attributes[i];
822 }
823
824 return 0;
825 }
826
// Find first non-whitespace character (set first) and last non-whitespace
// character (set last one past this). For example, consider this string:
//
//     " 87 "
//
// The first pointer would point to '8' and the last pointer would point one
// beyond '7'. For an empty or all-whitespace string, both pointers are set
// to the null terminator.

static void _findEnds(
    const char* str,
    const char*& first,
    const char*& last)
{
    first = str;

    // isspace() is only defined for values representable as unsigned char
    // (or EOF); plain char may be negative for bytes >= 0x80, so convert.

    while (isspace(static_cast<unsigned char>(*first)))
        first++;

    if (!*first)
    {
        last = first;
        return;
    }

    last = first + strlen(first);

    while (last != first && isspace(static_cast<unsigned char>(last[-1])))
        last--;
}
856
857 Boolean XmlEntry::getAttributeValue(
858 const char* name,
859 Uint32& value) const
860 {
861 const XmlAttribute* attr = findAttribute(name);
862 mike 1.13
863 if (!attr)
864 return false;
865
866 const char* first;
867 const char* last;
868 _findEnds(attr->value, first, last);
869
870 char* end = 0;
871 long tmp = strtol(first, &end, 10);
872
873 if (!end || end != last)
874 return false;
875
876 value = Uint32(tmp);
877 return true;
878 }
879
880 Boolean XmlEntry::getAttributeValue(
881 const char* name,
882 Real32& value) const
883 mike 1.13 {
884 const XmlAttribute* attr = findAttribute(name);
885
886 if (!attr)
887 return false;
888
889 const char* first;
890 const char* last;
891 _findEnds(attr->value, first, last);
892
893 char* end = 0;
894 double tmp = strtod(first, &end);
895
896 if (!end || end != last)
897 return false;
898
899 value = Uint32(tmp);
900 return true;
901 }
902
903 Boolean XmlEntry::getAttributeValue(
904 mike 1.13 const char* name,
905 const char*& value) const
906 {
907 const XmlAttribute* attr = findAttribute(name);
908
909 if (!attr)
910 return false;
911
912 value = attr->value;
913 return true;
914 }
915
916 Boolean XmlEntry::getAttributeValue(const char* name, String& value) const
917 {
918 const char* tmp;
919
920 if (!getAttributeValue(name, tmp))
921 return false;
922
923 value = tmp;
924 return true;
925 mike 1.13 }
926
927 void XmlAppendCString(Array<Sint8>& out, const char* str)
928 {
929 out.append(str, strlen(str));
930 }
931
932 PEGASUS_NAMESPACE_END
|