1 //%/////////////////////////////////////////////////////////////////////////////
\r
3 // Copyright (c) 2000, 2001 The Open group, BMC Software, Tivoli Systems, IBM
\r
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
\r
6 // of this software and associated documentation files (the "Software"), to
\r
7 // deal in the Software without restriction, including without limitation the
\r
8 // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
\r
9 // sell copies of the Software, and to permit persons to whom the Software is
\r
10 // furnished to do so, subject to the following conditions:
\r
12 // THE ABOVE COPYRIGHT NOTICE AND THIS PERMISSION NOTICE SHALL BE INCLUDED IN
\r
13 // ALL COPIES OR SUBSTANTIAL PORTIONS OF THE SOFTWARE. THE SOFTWARE IS PROVIDED
\r
14 // "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
\r
15 // LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
\r
16 // PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
\r
17 // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
\r
18 // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
\r
19 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\r
21 //==============================================================================
\r
23 // Author: Mike Brasher (mbrasher@bmc.com)
\r
27 //%/////////////////////////////////////////////////////////////////////////////
\r
29 ////////////////////////////////////////////////////////////////////////////////
\r
33 // This file contains a simple non-validating XML parser. Here are
\r
34 // several rules for well-formed XML:
\r
36 // 1. Documents must begin with an XML declaration:
\r
38 // <?xml version="1.0" standalone="yes"?>
\r
40 // 2. Comments have the form:
\r
42 // <!-- blah blah blah -->
\r
44 // 3. The following entity references are supported:
\r
48 // &gt; - greater-than
\r
49 // &quot; - full quote
\r
50 // &apos; - apostrophe
\r
52 // 4. Element names and attribute names take the following form:
\r
54 // [A-Za-z_][A-Za-z_0-9-.:]*
\r
56 // 5. Arbitrary data (CDATA) can be enclosed like this:
\r
62 // 6. Element names and attributes names are case-sensitive.
\r
64 // 7. XmlAttribute values must be delimited by full or half quotes.
\r
65 // XmlAttribute values must be delimited.
\r
71 // Handle <!DOCTYPE...> sections which are complicated (containing
\r
72 // rules rather than references to files).
\r
74 // Handle reference of this form: "&#913;"
\r
76 // Remove newlines from string literals:
\r
78 // Example: <xyz x="hello
\r
81 ////////////////////////////////////////////////////////////////////////////////
\r
88 #include "XmlParser.h"
\r
91 PEGASUS_NAMESPACE_BEGIN
\r
93 #define PEGASUS_ARRAY_T XmlEntry
\r
94 # include "ArrayImpl.h"
\r
95 #undef PEGASUS_ARRAY_T
\r
98 ////////////////////////////////////////////////////////////////////////////////
\r
100 // Static helper functions
\r
102 ////////////////////////////////////////////////////////////////////////////////
\r
104 static void _printValue(const char* p)
\r
109 PEGASUS_STD(cout) << "\\n";
\r
110 else if (*p == '\r')
\r
111 PEGASUS_STD(cout) << "\\r";
\r
112 else if (*p == '\t')
\r
113 PEGASUS_STD(cout) << "\\t";
\r
115 PEGASUS_STD(cout) << *p;
\r
119 struct EntityReference
\r
126 static EntityReference _references[] =
\r
128 { "&amp;", 5, '&' }, // stored length 5 == strlen("&amp;"), as the strncmp lookup requires
\r
129 { "&lt;", 4, '<' }, // stored length 4 == strlen("&lt;"), as the strncmp lookup requires
\r
130 { "&gt;", 4, '>' }, // stored length 4 == strlen("&gt;"), as the strncmp lookup requires
\r
131 { """, 6, '"' },
\r
132 { "&apos;", 6, '\'' } // stored length 6 == strlen("&apos;"), as the strncmp lookup requires
\r
135 static const Uint32 _REFERENCES_SIZE = (sizeof(_references) / sizeof(_references[0])); // element count of _references; const because it is fixed at compile time
\r
137 // Remove all redundant spaces from the given string:
\r
139 static void _normalize(char* text)
\r
141 Uint32 length = strlen(text);
\r
143 char* end = p + length;
\r
145 // Remove leading spaces:
\r
147 while (isspace(*p))
\r
151 memmove(text, p, end - p + 1);
\r
155 // Look for sequences of more than one space and remove all but one.
\r
159 // Advance to the next space:
\r
161 while (*p && !isspace(*p))
\r
167 // Advance to the next non-space:
\r
171 while (isspace(*p))
\r
174 // Discard trailing spaces (if we are at the end):
\r
182 // Remove the redundant spaces:
\r
189 memmove(q, p, end - p + 1);
\r
195 ////////////////////////////////////////////////////////////////////////////////
\r
199 ////////////////////////////////////////////////////////////////////////////////
\r
201 static const char* _xmlMessages[] =
\r
203 "Bad opening element",
\r
204 "Bad closing element",
\r
205 "Bad attribute name",
\r
206 "Expected equal sign",
\r
207 "Bad attribute value",
\r
208 "A \"--\" sequence found within comment",
\r
209 "Unterminated comment",
\r
210 "Unterminated CDATA block",
\r
211 "Unterminated DOCTYPE",
\r
212 "Too many attributes: parser only handles 10",
\r
213 "Malformed reference",
\r
214 "Expected a comment or CDATA following \"<!\" sequence",
\r
215 "Closing element does not match opening element",
\r
216 "One or more tags are still open",
\r
217 "More than one root element was encountered",
\r
218 "Validation error",
\r
222 static String _formMessage(Uint32 code, Uint32 line, const String& message)
\r
224 String result = _xmlMessages[Uint32(code) - 1];
\r
227 sprintf(buffer, "%u", line); // line is a Uint32, so print it unsigned (assumes Uint32 is unsigned int -- confirm)
\r
228 result.append(": on line ");
\r
229 result.append(buffer);
\r
231 if (message.size())
\r
233 result.append(": ");
\r
234 result.append(message);
\r
240 XmlException::XmlException(
\r
241 XmlException::Code code,
\r
243 const String& message)
\r
244 : Exception(_formMessage(code, lineNumber, message))
\r
249 ////////////////////////////////////////////////////////////////////////////////
\r
251 // XmlValidationError
\r
253 ////////////////////////////////////////////////////////////////////////////////
\r
255 XmlValidationError::XmlValidationError(
\r
257 const String& message)
\r
258 : XmlException(XmlException::VALIDATION_ERROR, lineNumber, message)
\r
263 ////////////////////////////////////////////////////////////////////////////////
\r
265 // XmlSemanticError
\r
267 ////////////////////////////////////////////////////////////////////////////////
\r
269 XmlSemanticError::XmlSemanticError(
\r
271 const String& message)
\r
272 : XmlException(XmlException::SEMANTIC_ERROR, lineNumber, message)
\r
277 ////////////////////////////////////////////////////////////////////////////////
\r
281 ////////////////////////////////////////////////////////////////////////////////
\r
283 XmlParser::XmlParser(char* text) : _line(1), _text(text), _current(text),
\r
284 _restoreChar('\0'), _foundRoot(false)
\r
289 Boolean XmlParser::next(XmlEntry& entry)
\r
291 if (!_putBackStack.isEmpty())
\r
293 entry = _putBackStack.top();
\r
294 _putBackStack.pop();
\r
298 // If a character was overwritten with a null-terminator the last
\r
299 // time this routine was called, then put back that character. Before
\r
300 // exiting of course, restore the null-terminator.
\r
302 char* nullTerminator = 0;
\r
304 if (_restoreChar && !*_current)
\r
306 nullTerminator = _current;
\r
307 *_current = _restoreChar;
\r
308 _restoreChar = '\0';
\r
311 // Skip over any whitespace:
\r
313 _skipWhitespace(_current);
\r
317 if (nullTerminator)
\r
318 *nullTerminator = '\0';
\r
320 if (!_stack.isEmpty())
\r
321 throw XmlException(XmlException::UNCLOSED_TAGS, _line);
\r
326 // Either a "<...>" or content begins next:
\r
328 if (*_current == '<')
\r
331 _getElement(_current, entry);
\r
333 if (nullTerminator)
\r
334 *nullTerminator = '\0';
\r
336 if (entry.type == XmlEntry::START_TAG)
\r
338 if (_stack.isEmpty() && _foundRoot)
\r
339 throw XmlException(XmlException::MULTIPLE_ROOTS, _line);
\r
342 _stack.push((char*)entry.text);
\r
344 else if (entry.type == XmlEntry::END_TAG)
\r
346 if (_stack.isEmpty())
\r
347 throw XmlException(XmlException::START_END_MISMATCH, _line);
\r
349 if (strcmp(_stack.top(), entry.text) != 0)
\r
350 throw XmlException(XmlException::START_END_MISMATCH, _line);
\r
359 entry.type = XmlEntry::CONTENT;
\r
360 entry.text = _current;
\r
361 _getContent(_current);
\r
362 _restoreChar = *_current;
\r
365 if (nullTerminator)
\r
366 *nullTerminator = '\0';
\r
368 _substituteReferences((char*)entry.text);
\r
369 _normalize((char*)entry.text);
\r
375 void XmlParser::putBack(XmlEntry& entry)
\r
377 _putBackStack.push(entry);
\r
380 XmlParser::~XmlParser()
\r
385 void XmlParser::_skipWhitespace(char*& p)
\r
387 while (*p && isspace(*p))
\r
396 Boolean XmlParser::_getElementName(char*& p)
\r
398 if (!isalpha(*p) && *p != '_')
\r
399 throw XmlException(XmlException::BAD_START_TAG, _line);
\r
402 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
\r
405 // The next character must be a space:
\r
410 _skipWhitespace(p);
\r
422 Boolean XmlParser::_getOpenElementName(char*& p, Boolean& openCloseElement)
\r
424 openCloseElement = false;
\r
426 if (!isalpha(*p) && *p != '_')
\r
427 throw XmlException(XmlException::BAD_START_TAG, _line);
\r
430 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
\r
433 // The next character must be a space:
\r
438 _skipWhitespace(p);
\r
447 if (p[0] == '/' && p[1] == '>')
\r
449 openCloseElement = true;
\r
458 void XmlParser::_getAttributeNameAndEqual(char*& p)
\r
460 if (!isalpha(*p) && *p != '_')
\r
461 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
\r
464 (isalnum(*p) || *p == '_' || *p == '-' || *p == ':' || *p == '.'))
\r
469 _skipWhitespace(p);
\r
472 throw XmlException(XmlException::BAD_ATTRIBUTE_NAME, _line);
\r
476 _skipWhitespace(p);
\r
481 void XmlParser::_getAttributeValue(char*& p)
\r
483 // ATTN-B: handle values contained in semiquotes:
\r
485 if (*p != '"' && *p != '\'')
\r
486 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
\r
488 char startChar = *p++;
\r
490 while (*p && *p != startChar)
\r
493 if (*p != startChar)
\r
494 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
\r
499 void XmlParser::_getComment(char*& p)
\r
501 // Now p points to first non-whitespace character beyond "<!--" sequence:
\r
505 if (p[0] == '-' && p[1] == '-')
\r
509 throw XmlException(
\r
510 XmlException::MINUS_MINUS_IN_COMMENT, _line);
\r
513 // Find end of comment (excluding whitespace):
\r
521 // If it got this far, then the comment is unterminated:
\r
523 throw XmlException(XmlException::UNTERMINATED_COMMENT, _line);
\r
526 void XmlParser::_getCData(char*& p)
\r
528 // At this point p points one past "<![CDATA[" sequence:
\r
532 if (p[0] == ']' && p[1] == ']' && p[2] == '>')
\r
538 else if (*p == '\n')
\r
542 // If it got this far, then the CDATA block is unterminated:
\r
544 throw XmlException(XmlException::UNTERMINATED_CDATA, _line);
\r
547 void XmlParser::_getDocType(char*& p)
\r
549 // Just ignore the DOCTYPE command for now:
\r
551 for (; *p && *p != '>'; p++)
\r
558 throw XmlException(XmlException::UNTERMINATED_DOCTYPE, _line);
\r
563 void XmlParser::_getContent(char*& p)
\r
565 while (*p && *p != '<')
\r
574 void XmlParser::_substituteReferences(char* text)
\r
576 Uint32 rem = strlen(text);
\r
578 for (char* p = text; *p; p++, rem--)
\r
582 // Look for predefined entity reference:
\r
584 Boolean found = false;
\r
586 for (Uint32 i = 0; i < _REFERENCES_SIZE; i++)
\r
588 Uint32 length = _references[i].length;
\r
589 const char* match = _references[i].match;
\r
591 if (strncmp(p, _references[i].match, length) == 0)
\r
594 *p = _references[i].replacement;
\r
595 char* q = p + length;
\r
596 rem = rem - length + 1;
\r
597 memmove(p + 1, q, rem);
\r
601 // If not found, then at least make sure it is well formed:
\r
608 XmlException::Code code = XmlException::MALFORMED_REFERENCE;
\r
610 if (isalpha(*p) || *p == '_')
\r
612 for (p++; *p && *p != ';'; p++)
\r
614 if (!isalnum(*p) && *p != '_')
\r
615 throw XmlException(code, _line);
\r
618 else if (*p == '#')
\r
620 for (p++ ; *p && *p != ';'; p++)
\r
623 throw XmlException(code, _line);
\r
628 throw XmlException(code, _line);
\r
636 static const char _EMPTY_STRING[] = ""; // empty text assigned to entries that carry no text of their own (used for DOCTYPE entries)
\r
638 void XmlParser::_getElement(char*& p, XmlEntry& entry)
\r
640 entry.attributeCount = 0;
\r
642 //--------------------------------------------------------------------------
\r
643 // Get the element name (expect one of these: '?', '!', [A-Za-z_])
\r
644 //--------------------------------------------------------------------------
\r
648 entry.type = XmlEntry::XML_DECLARATION;
\r
651 Boolean openCloseElement = false;
\r
653 if (_getElementName(p))
\r
656 else if (*p == '!')
\r
660 // Expect a comment or CDATA:
\r
662 if (p[0] == '-' && p[1] == '-')
\r
665 entry.type = XmlEntry::COMMENT;
\r
670 else if (memcmp(p, "[CDATA[", 7) == 0)
\r
673 entry.type = XmlEntry::CDATA;
\r
678 else if (memcmp(p, "DOCTYPE", 7) == 0)
\r
680 entry.type = XmlEntry::DOCTYPE;
\r
681 entry.text = _EMPTY_STRING;
\r
685 throw(XmlException(XmlException::EXPECTED_COMMENT_OR_CDATA, _line));
\r
687 else if (*p == '/')
\r
689 entry.type = XmlEntry::END_TAG;
\r
692 if (!_getElementName(p))
\r
693 throw(XmlException(XmlException::BAD_END_TAG, _line));
\r
697 else if (isalpha(*p) || *p == '_')
\r
699 entry.type = XmlEntry::START_TAG;
\r
702 Boolean openCloseElement = false;
\r
704 if (_getOpenElementName(p, openCloseElement))
\r
706 if (openCloseElement)
\r
707 entry.type = XmlEntry::EMPTY_TAG;
\r
712 throw XmlException(XmlException::BAD_START_TAG, _line);
\r
714 //--------------------------------------------------------------------------
\r
715 // Grab all the attributes:
\r
716 //--------------------------------------------------------------------------
\r
720 if (entry.type == XmlEntry::XML_DECLARATION)
\r
722 if (p[0] == '?' && p[1] == '>')
\r
728 else if (entry.type == XmlEntry::START_TAG && p[0] == '/' && p[1] =='>')
\r
730 entry.type = XmlEntry::EMPTY_TAG;
\r
734 else if (*p == '>')
\r
742 _getAttributeNameAndEqual(p);
\r
744 if (*p != '"' && *p != '\'')
\r
745 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
\r
747 attr.value = p + 1;
\r
748 _getAttributeValue(p);
\r
750 if (entry.type == XmlEntry::XML_DECLARATION)
\r
752 // The next thing must be a space or a "?>":
\r
754 if (!(p[0] == '?' && p[1] == '>') && !isspace(*p))
\r
756 throw XmlException(
\r
757 XmlException::BAD_ATTRIBUTE_VALUE, _line);
\r
760 else if (!(*p == '>' || (p[0] == '/' && p[1] == '>') || isspace(*p)))
\r
762 // The next thing must be a space or a '>':
\r
764 throw XmlException(XmlException::BAD_ATTRIBUTE_VALUE, _line);
\r
767 _skipWhitespace(p);
\r
769 if (entry.attributeCount == XmlEntry::MAX_ATTRIBUTES)
\r
770 throw XmlException(XmlException::TOO_MANY_ATTRIBUTES, _line);
\r
772 _substituteReferences((char*)attr.value);
\r
773 entry.attributes[entry.attributeCount++] = attr;
\r
777 static const char* _typeStrings[] =
\r
779 "XML_DECLARATION",
\r
789 void XmlEntry::print() const
\r
791 PEGASUS_STD(cout) << "=== " << _typeStrings[type] << " ";
\r
793 Boolean needQuotes = type == XmlEntry::CDATA || type == XmlEntry::CONTENT;
\r
796 PEGASUS_STD(cout) << "\"";
\r
801 PEGASUS_STD(cout) << "\"";
\r
803 PEGASUS_STD(cout) << '\n';
\r
805 for (Uint32 i = 0; i < attributeCount; i++)
\r
807 PEGASUS_STD(cout) << " " << attributes[i].name << "=\"";
\r
808 _printValue(attributes[i].value);
\r
809 PEGASUS_STD(cout) << "\"" << PEGASUS_STD(endl);
\r
813 const XmlAttribute* XmlEntry::findAttribute(
\r
814 const char* name) const
\r
816 for (Uint32 i = 0; i < attributeCount; i++)
\r
818 if (strcmp(attributes[i].name, name) == 0)
\r
819 return &attributes[i];
\r
825 // Find first non-whitespace character (set first) and last non-whitespace
\r
826 // character (set last one past this). For example, consider this string:
\r
830 // The first pointer would point to '8' and the last pointer would point one
\r
833 static void _findEnds(
\r
835 const char*& first,
\r
840 while (isspace(*first))
\r
849 last = first + strlen(first);
\r
851 while (last != first && isspace(last[-1]))
\r
855 Boolean XmlEntry::getAttributeValue(
\r
857 Uint32& value) const
\r
859 const XmlAttribute* attr = findAttribute(name);
\r
866 _findEnds(attr->value, first, last);
\r
869 long tmp = strtol(first, &end, 10);
\r
871 if (!end || end != last)
\r
874 value = Uint32(tmp);
\r
878 Boolean XmlEntry::getAttributeValue(
\r
880 Real32& value) const
\r
882 const XmlAttribute* attr = findAttribute(name);
\r
889 _findEnds(attr->value, first, last);
\r
892 double tmp = strtod(first, &end);
\r
894 if (!end || end != last)
\r
897 value = Real32(tmp); // narrow the parsed double to the Real32 out-parameter; a Uint32 cast would drop the fraction and mangle negative values
\r
901 Boolean XmlEntry::getAttributeValue(
\r
903 const char*& value) const
\r
905 const XmlAttribute* attr = findAttribute(name);
\r
910 value = attr->value;
\r
914 Boolean XmlEntry::getAttributeValue(const char* name, String& value) const
\r
918 if (!getAttributeValue(name, tmp))
\r
925 void XmlAppendCString(Array<Sint8>& out, const char* str)
\r
927 out.append(str, strlen(str));
\r
930 PEGASUS_NAMESPACE_END
\r