1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42 """
43 Provides general XML-related functionality.
44
45 What I'm trying to do here is abstract much of the functionality that directly
46 accesses the DOM tree. This is not so much to "protect" the other code from
47 the DOM, but to standardize the way it's used. It will also help extension
48 authors write code that easily looks more like the rest of Cedar Backup.
49
50 @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren,
51 readFirstChild, readStringList, readString, readInteger, readBoolean,
52 addContainerNode, addStringNode, addIntegerNode, addBooleanNode,
53 TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES
54
55 @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}.
56 @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}.
57 @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML.
58
59 @author: Kenneth J. Pronovici <pronovic@ieee.org>
60 """
61
62
63
64
65
66
67 import sys
68 import re
69 import logging
70 import codecs
71 from types import UnicodeType
72 from StringIO import StringIO
73
74
75 from xml.parsers.expat import ExpatError
76 from xml.dom.minidom import Node
77 from xml.dom.minidom import getDOMImplementation
78 from xml.dom.minidom import parseString
79
80
81
82
83
84
85 logger = logging.getLogger("CedarBackup2.log.xml")
86
87 TRUE_BOOLEAN_VALUES = [ "Y", "y", ]
88 FALSE_BOOLEAN_VALUES = [ "N", "n", ]
89 VALID_BOOLEAN_VALUES = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES
90
91
92
93
94
95
109
# NOTE(review): the "def" line of this function is missing from the reviewed
# listing.  The name comes from the module's @sort list and the parameter from
# the docstring; the default value for "name" is an assumption -- confirm.
def createOutputDom(name="cb_config"):
    """
    Creates a DOM tree used for writing an XML document.

    @param name: Base name of the document (root node name).

    @return: Tuple (xmlDom, parentNode) for the new document, where
             parentNode is the document's root element.
    """
    impl = getDOMImplementation()
    xmlDom = impl.createDocument(None, name, None)
    return (xmlDom, xmlDom.documentElement)
119
120
121
122
123
124
def isElement(node):
    """
    Returns True or False depending on whether the XML node is an element node.

    @param node: DOM node to check.

    @return: C{True} if the node's C{nodeType} is C{Node.ELEMENT_NODE},
             C{False} otherwise.
    """
    return node.nodeType == Node.ELEMENT_NODE
130
def readChildren(parent, name):
    """
    Returns a list of nodes with a given name immediately beneath the
    parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    Underneath, we use the Python C{getElementsByTagName} method, which
    returns a list of all children with a given name below the parent, at
    any level.  We just prune that list to include only children whose
    C{parentNode} is the passed-in parent.

    @param parent: Parent node to search beneath (may be C{None}).
    @param name: Name of nodes to search for.

    @return: List of child nodes with correct parent, or an empty list if
             no matching nodes are found or the parent is C{None}.
    """
    if parent is None:
        return []
    # Identity comparison, not equality: only direct children qualify.
    return [entry for entry in parent.getElementsByTagName(name)
            if entry.parentNode is parent]
158
def readFirstChild(parent, name):
    """
    Returns the first child with a given name immediately beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: First properly-named child of parent, or C{None} if no matching
             nodes are found.
    """
    children = readChildren(parent, name)
    if not children:
        return None
    return children[0]
175
def readStringList(parent, name):
    """
    Returns a list of the string contents associated with nodes with a given
    name immediately beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    First, we find all of the nodes using L{readChildren}, and then we
    retrieve the "string contents" of each of those nodes.  We assume that
    string contents of a given node belong to the first C{TEXT_NODE} child of
    that node.  Nodes which have no C{TEXT_NODE} children are not represented
    in the returned list.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: List of strings as described above, or C{None} if the list would
             be empty (no matching nodes, or none of them has a text child).
    """
    values = []
    for entry in readChildren(parent, name):
        for child in entry.childNodes:
            if child.nodeType == Node.TEXT_NODE:
                values.append(child.nodeValue)
                break   # only the first text child counts
    # Callers rely on None, not [], to signal "nothing found".
    return values or None
207
def readString(parent, name):
    """
    Returns string contents of the first child with a given name immediately
    beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.  We assume that string
    contents of a given node belong to the first C{TEXT_NODE} child of that
    node.

    The value is the first entry returned by L{readStringList}, so a matching
    node without any text child is skipped in favor of the next one.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: String contents of node or C{None} if no matching nodes are found.
    """
    values = readStringList(parent, name)
    if values is None:
        return None
    return values[0]
227
def readInteger(parent, name):
    """
    Returns integer contents of the first child with a given name immediately
    beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Integer contents of node or C{None} if no matching nodes are found.
    @raise ValueError: If the string at the location can't be converted to an integer.
    """
    value = readString(parent, name)
    if value is None:
        return None
    return int(value)
247
# NOTE(review): the "def" line of this function is missing from the reviewed
# listing and the name is not in the module's @sort list; "readFloat" is
# assumed by analogy with the sibling readers -- confirm.
def readFloat(parent, name):
    """
    Returns float contents of the first child with a given name immediately
    beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Float contents of node or C{None} if no matching nodes are found.
    @raise ValueError: If the string at the location can't be converted to a
                       float value.
    """
    value = readString(parent, name)
    if value is None:
        return None
    return float(value)
268
def readBoolean(parent, name):
    """
    Returns boolean contents of the first child with a given name immediately
    beneath the parent.

    By "immediately beneath" the parent, we mean from among nodes that are
    direct children of the passed-in parent node.

    The string value of the node must be one of the values in L{VALID_BOOLEAN_VALUES}.

    @param parent: Parent node to search beneath.
    @param name: Name of node to search for.

    @return: Boolean contents of node or C{None} if no matching nodes are found.
    @raise ValueError: If the string at the location can't be converted to a boolean.
    """
    value = readString(parent, name)
    if value is None:
        return None
    if value in TRUE_BOOLEAN_VALUES:
        return True
    if value in FALSE_BOOLEAN_VALUES:
        return False
    raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES)
295
296
297
298
299
300
def addContainerNode(xmlDom, parentNode, nodeName):
    """
    Adds a container node as the next child of a parent node.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.

    @return: Reference to the newly-created node.
    """
    containerNode = xmlDom.createElement(nodeName)
    parentNode.appendChild(containerNode)
    return containerNode
314
def addStringNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a text node as the next child of a parent, to contain a string.

    If the C{nodeValue} is None, then the node will be created, but will be
    empty (i.e. will contain no text node child).

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    containerNode = addContainerNode(xmlDom, parentNode, nodeName)
    if nodeValue is not None:
        containerNode.appendChild(xmlDom.createTextNode(nodeValue))
    return containerNode
334
def addIntegerNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a text node as the next child of a parent, to contain an integer.

    If the C{nodeValue} is None, then the node will be created, but will be
    empty (i.e. will contain no text node child).

    The integer will be converted to a string using "%d".  The result will be
    added to the document via L{addStringNode}.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    text = None if nodeValue is None else "%d" % nodeValue
    return addStringNode(xmlDom, parentNode, nodeName, text)
356
def addBooleanNode(xmlDom, parentNode, nodeName, nodeValue):
    """
    Adds a text node as the next child of a parent, to contain a boolean.

    If the C{nodeValue} is None, then the node will be created, but will be
    empty (i.e. will contain no text node child).

    Boolean C{True}, or anything else interpreted as C{True} by Python, will
    be converted to a string "Y".  Anything else will be converted to a
    string "N".  The result is added to the document via L{addStringNode}.

    @param xmlDom: DOM tree as from C{impl.createDocument()}.
    @param parentNode: Parent node to create child for.
    @param nodeName: Name of the new container node.
    @param nodeValue: The value to put into the node.

    @return: Reference to the newly-created node.
    """
    if nodeValue is None:
        text = None   # None means "empty node", not "N"
    else:
        text = "Y" if nodeValue else "N"
    return addStringNode(xmlDom, parentNode, nodeName, text)
382
383
384
385
386
387
# NOTE(review): the "def" line of this function is missing from the reviewed
# listing.  The default for "indent" is assumed to match the default used by
# Serializer.__init__ -- confirm.
def serializeDom(xmlDom, indent=3):
    """
    Serializes a DOM tree and returns the result in a string.

    The document is written as UTF-8 through L{Serializer} into an in-memory
    buffer, which is always closed, even if serialization fails.

    @param xmlDom: XML DOM tree to serialize
    @param indent: Number of spaces to indent, as an integer

    @return: String form of DOM tree, pretty-printed.
    @raise ValueError: If there's an unknown node type in the document.
    """
    xmlBuffer = StringIO()
    try:
        Serializer(xmlBuffer, "UTF-8", indent=indent).serialize(xmlDom)
        return xmlBuffer.getvalue()
    finally:
        xmlBuffer.close()
401
403
# NOTE(review): the "class" line and most "def" lines of this class are missing
# from the reviewed listing; they are restored here from the call sites inside
# the class.  The base class, and the bodies of _visitDocument,
# _visitDocumentFragment and _visitComment (which were collapsed in the
# listing), are reconstructions -- confirm against the upstream file.
class Serializer(object):

    """
    XML serializer class.

    This is a customized serializer that I hacked together based on what I found
    in the PyXML distribution.  Basically, around release 2.7.0, the only reason
    I still had around a dependency on PyXML was for the PrettyPrint
    functionality, and that seemed pointless.  So, I stripped the PrettyPrint
    code out of PyXML and hacked bits of it off until it did just what I needed
    and no more.

    This code started out being called PrintVisitor, but I decided it makes more
    sense just calling it a serializer.  I've made nearly all of the methods
    private, and I've added a new high-level serialize() method rather than
    having clients call C{visit()}.

    Anyway, as a consequence of my hacking with it, this can't quite be called a
    complete XML serializer any more.  I ripped out support for HTML and XHTML,
    and there is also no longer any support for namespaces (which I took out
    because this dragged along a lot of extra code, and Cedar Backup doesn't use
    namespaces).  However, everything else should pretty much work as expected.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """

    # Maps a DOM nodeType constant to the name of the method that writes it.
    _VISITORS = {
        Node.ELEMENT_NODE: "_visitElement",
        Node.ATTRIBUTE_NODE: "_visitAttr",
        Node.TEXT_NODE: "_visitText",
        Node.CDATA_SECTION_NODE: "_visitCDATASection",
        Node.ENTITY_REFERENCE_NODE: "_visitEntityReference",
        Node.ENTITY_NODE: "_visitEntity",
        Node.PROCESSING_INSTRUCTION_NODE: "_visitProcessingInstruction",
        Node.COMMENT_NODE: "_visitComment",
        Node.DOCUMENT_NODE: "_visitDocument",
        Node.DOCUMENT_TYPE_NODE: "_visitDocumentType",
        Node.DOCUMENT_FRAGMENT_NODE: "_visitDocumentFragment",
        Node.NOTATION_NODE: "_visitNotation",
    }

    def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3):
        """
        Initialize a serializer.
        @param stream: Stream to write output to.
        @param encoding: Output encoding.
        @param indent: Number of spaces to indent, as an integer
        """
        self.stream = stream
        self.encoding = encoding
        self._indent = indent * " "   # one indentation step, as a string
        self._depth = 0               # current element nesting level
        self._inText = 0              # non-zero right after text was written

    def serialize(self, xmlDom):
        """
        Serialize the passed-in XML document.
        @param xmlDom: XML DOM tree to serialize
        @raise ValueError: If there's an unknown node type in the document.
        """
        self._visit(xmlDom)
        self.stream.write("\n")

    def _write(self, text):
        """Encodes text using the configured encoding and writes it to the stream."""
        self.stream.write(_encodeText(text, self.encoding))

    def _tryIndent(self):
        """Starts a new, indented line unless we are in the middle of text."""
        if not self._inText and self._indent:
            self._write('\n' + self._indent * self._depth)

    def _visit(self, node):
        """
        Dispatches a node to the visitor method matching its C{nodeType}.
        @raise ValueError: If there's an unknown node type in the document.
        """
        visitor = self._VISITORS.get(node.nodeType)
        if visitor is None:
            raise ValueError("Unknown node type: %s" % repr(node))
        return getattr(self, visitor)(node)

    # NOTE(review): default for "exclude" assumed; _visitElement calls this
    # method without it.
    def _visitNodeList(self, node, exclude=None):
        """Visits every node in a node list, except the C{exclude} node."""
        for curr in node:
            if curr is not exclude:
                self._visit(curr)

    def _visitNamedNodeMap(self, node):
        """Visits every value held in a named node map."""
        for item in node.values():
            self._visit(item)

    def _visitAttr(self, node):
        """Writes an attribute as C{ name=value}, quoted with a safe delimiter."""
        self._write(' ' + node.name)
        text = _translateCDATA(node.value, self.encoding)
        text, delimiter = _translateCDATAAttr(text)
        # _translateCDATA() already encoded the value, so bypass _write().
        self.stream.write("=%s%s%s" % (delimiter, text, delimiter))

    def _visitProlog(self):
        """Writes the XML declaration."""
        self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8'))
        self._inText = 0

    # NOTE(review): body reconstructed (collapsed in the reviewed listing).
    def _visitDocument(self, node):
        """Writes the prolog, the doctype (if any) and then the remaining children."""
        self._visitProlog()
        if node.doctype:
            self._visitDocumentType(node.doctype)
        self._visitNodeList(node.childNodes, exclude=node.doctype)

    # NOTE(review): body reconstructed (collapsed in the reviewed listing).
    def _visitDocumentFragment(self, node):
        """Writes every child of a document fragment."""
        self._visitNodeList(node.childNodes)

    def _visitElement(self, node):
        """Writes an element, its attributes and (recursively) its children."""
        self._tryIndent()
        self._write('<%s' % node.tagName)
        for attr in node.attributes.values():
            self._visitAttr(attr)
        if len(node.childNodes):
            self._write('>')
            self._depth += 1
            self._visitNodeList(node.childNodes)
            self._depth -= 1
            # Keep the closing tag on the same line as trailing text.
            if not self._inText:
                self._tryIndent()
            self._write('</%s>' % node.tagName)
        else:
            self._write('/>')
        self._inText = 0

    def _visitText(self, node):
        """
        Writes a text node.

        When indenting, surrounding whitespace is stripped and nodes that
        contain only whitespace are skipped, so that whitespace already in the
        tree does not interfere with the indentation we generate.
        """
        text = node.data
        if self._indent:
            # str.strip() returns a new string; the result must be kept.
            text = text.strip()
        if text:
            # _translateCDATA() returns encoded data, so bypass _write().
            self.stream.write(_translateCDATA(text, self.encoding))
            self._inText = 1

    def _visitDocumentType(self, doctype):
        """Writes a DOCTYPE declaration, unless it has no system or public id."""
        if not doctype.systemId and not doctype.publicId:
            return
        self._tryIndent()
        self._write('<!DOCTYPE %s' % doctype.name)
        # Quote each id with whichever delimiter it does not contain.
        if doctype.systemId and '"' in doctype.systemId:
            system = "'%s'" % doctype.systemId
        else:
            system = '"%s"' % doctype.systemId
        if doctype.publicId and '"' in doctype.publicId:
            public = "'%s'" % doctype.publicId
        else:
            public = '"%s"' % doctype.publicId
        if doctype.publicId and doctype.systemId:
            self._write(' PUBLIC %s %s' % (public, system))
        elif doctype.systemId:
            self._write(' SYSTEM %s' % system)
        if doctype.entities or doctype.notations:
            self._write(' [')
            self._depth += 1
            self._visitNamedNodeMap(doctype.entities)
            self._visitNamedNodeMap(doctype.notations)
            self._depth -= 1
            self._tryIndent()
            self._write(']>')
        else:
            self._write('>')
        self._inText = 0

    def _visitEntity(self, node):
        """Visited from a NamedNodeMap in DocumentType"""
        self._tryIndent()
        self._write('<!ENTITY %s' % (node.nodeName))
        if node.publicId:
            self._write(' PUBLIC %s' % node.publicId)
        if node.systemId:
            self._write(' SYSTEM %s' % node.systemId)
        if node.notationName:
            self._write(' NDATA %s' % node.notationName)
        self._write('>')

    def _visitNotation(self, node):
        """Visited from a NamedNodeMap in DocumentType"""
        self._tryIndent()
        self._write('<!NOTATION %s' % node.nodeName)
        if node.publicId:
            self._write(' PUBLIC %s' % node.publicId)
        if node.systemId:
            self._write(' SYSTEM %s' % node.systemId)
        self._write('>')

    def _visitCDATASection(self, node):
        """Writes a CDATA section; its data is written without escaping."""
        self._tryIndent()
        self._write('<![CDATA[%s]]>' % (node.data))
        self._inText = 0

    # NOTE(review): body reconstructed (collapsed in the reviewed listing).
    def _visitComment(self, node):
        """Writes a comment."""
        self._tryIndent()
        self._write('<!--%s-->' % (node.data))
        self._inText = 0

    def _visitEntityReference(self, node):
        """Writes an entity reference; it counts as text for indentation."""
        self._write('&%s;' % node.nodeName)
        self._inText = 1

    def _visitProcessingInstruction(self, node):
        """Writes a processing instruction."""
        self._tryIndent()
        self._write('<?%s %s?>' % (node.target, node.data))
        self._inText = 0
639
640 -def _encodeText(text, encoding):
641 """
642 @copyright: This code, prior to customization, was part of the PyXML
643 codebase, and before that was part of the 4DOM suite developed by
644 Fourthought, Inc. It its original form, it was attributed to Martin v.
645 Löwis and was Copyright (c) 2000 Fourthought Inc, USA; All Rights Reserved.
646 """
647 encoder = codecs.lookup(encoding)[0]
648 if type(text) is not UnicodeType:
649 text = unicode(text, "utf-8")
650 return encoder(text)[0]
651
653 """
654 Handles normalization and some intelligence about quoting.
655
656 @copyright: This code, prior to customization, was part of the PyXML
657 codebase, and before that was part of the 4DOM suite developed by
658 Fourthought, Inc. It its original form, it was Copyright (c) 2000
659 Fourthought Inc, USA; All Rights Reserved.
660 """
661 if not characters:
662 return '', "'"
663 if "'" in characters:
664 delimiter = '"'
665 new_chars = re.sub('"', '"', characters)
666 else:
667 delimiter = "'"
668 new_chars = re.sub("'", ''', characters)
669
670
671
672 if "\n" in characters:
673 new_chars = re.sub('\n', ' ', new_chars)
674 return new_chars, delimiter
675
676
# Markup characters that must be escaped in character data, and their entities.
_CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
_CHAR_TO_ENTITY = { '&': '&amp;', '<': '&lt;', ']]>': ']]&gt;', }

# Characters that are replaced with numeric character references.
_ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
_ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
_XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s' % (_ILLEGAL_LOW_CHARS, _ILLEGAL_HIGH_CHARS))

def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
    """
    Escapes character data for output and encodes it.

    Unless C{markupSafe} is set, C{&}, C{<} and C{]]>} are replaced with
    entity references.  Matches of the illegal-character pattern are then
    replaced with numeric character references, and the result is encoded via
    L{_encodeText}.

    @param characters: Character data to translate.
    @param encoding: Output encoding.
    @param prev_chars: Data written just before C{characters}; used to catch
                       a C{]]>} sequence split across two calls.
    @param markupSafe: If true, markup characters are not escaped.

    @return: Escaped and encoded data, or C{''} if C{characters} is empty.

    @copyright: This code, prior to customization, was part of the PyXML
    codebase, and before that was part of the 4DOM suite developed by
    Fourthought, Inc.  In its original form, it was Copyright (c) 2000
    Fourthought Inc, USA; All Rights Reserved.
    """
    if not characters:
        return ''
    if not markupSafe:
        new_string = _CDATA_CHAR_PATTERN.sub(lambda m: _CHAR_TO_ENTITY[m.group()], characters)
        # "]]" already written plus a leading ">" here would form "]]>".
        if prev_chars[-2:] == ']]' and characters[0] == '>':
            new_string = '&gt;' + new_string[1:]
    else:
        new_string = characters
    # NOTE(review): a match of the three-character "high" alternative would
    # make ord() raise TypeError, since ord() needs a single character.
    if _XML_ILLEGAL_CHAR_PATTERN.search(new_string):
        new_string = _XML_ILLEGAL_CHAR_PATTERN.sub(lambda m: '&#%i;' % ord(m.group()), new_string)
    return _encodeText(new_string, encoding)
707