##   simplexml.py based on Matthew Allum's xmlstream.py
##
##   Copyright (C) 2003-2005 Alexey "Snake" Nezhdanov
##
##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2, or (at your option)
##   any later version.
##
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.

# $Id: simplexml.py,v 1.30 2006/06/03 12:22:34 normanr Exp $

"""The simplexml module provides the xmpppy library with all the tools needed to handle XML nodes and XML streams.
I personally use it in many other separate projects. It is designed to be as standalone as possible."""

import xml.parsers.expat

def XMLescape(txt):
    """Return "txt" with the symbols & < > " replaced by their respective XML entities.

    The ampersand is replaced first so that the ampersands introduced by the
    other three replacements are not escaped a second time.
    """
    return txt.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace('"', "&quot;")

# Encoding assumed for byte strings that have to be turned into text.
ENCODING = 'utf-8'


def ustr(what):
    """Convert object "what" to a unicode string.

    Text strings are returned unchanged and byte strings are decoded using
    ENCODING.  Any other object is converted through its own __str__ method if
    it has one, or through str() otherwise; a byte-string result is decoded
    using ENCODING as well.

    The text type is derived from a literal instead of the "unicode" builtin so
    that the function works on both Python 2 and Python 3.
    """
    text_type = type(u'')
    if isinstance(what, text_type):
        return what
    if isinstance(what, bytes):
        # On Python 3 bytes.__str__() would yield the "b'...'" repr, so decode
        # explicitly; on Python 2 bytes is str and this is the usual decode.
        return what.decode(ENCODING)
    try:
        r = what.__str__()
    except AttributeError:
        # Python 2 old-style instances may not define __str__ at all.
        r = str(what)
    if not isinstance(r, text_type):
        return text_type(r, ENCODING)
    return r

class Node:
    """ Node describes a single XML element.

        A node is created from a "namespace name" tag, a dictionary of attributes and a payload
        made of text strings and other nodes. The class does not parse text itself; building a node
        from its textual representation is delegated to the NodeBuilder class.
        After creation every part of a node can be changed.
        A node can be serialised into a string in one of two modes: the default one, where the
        textual representation describes the node exactly, and the "fancy" one, where whitespace
        is added to indent the output and make it more readable by a human.

        The class attribute FORCE_NODE_RECREATION defaults to false, which enables fast replication
        of a node from another node. The drawback of the fast way is that the child nodes are shared
        between the replica and the original node (the attribute dictionary and the lists themselves
        are copied), so changing a child of one node changes the other as well. When the attribute is
        set to true, the source node is serialised and parsed again instead.
    """
    FORCE_NODE_RECREATION = 0

    # NOTE: the mutable default arguments used throughout this class ({} and [])
    # are only ever read, never modified, so sharing them between calls is safe.

    def __init__(self, tag=None, attrs={}, payload=[], parent=None, node=None):
        """ Create a node.

            "tag" is the name of the node, optionally prepended by the namespace and separated from
            it by a space. "attrs" is the dictionary of attributes. "payload" is a list of text
            strings and child nodes (a single string is accepted too). "parent" is the node that
            this one is a child of. "node" is either a text string containing exactly one node or
            another Node instance to start from. If both "node" and other arguments are provided,
            the node is first created as a replica of "node" and then modified according to the
            other arguments. """
        if node:
            if self.FORCE_NODE_RECREATION and isinstance(node, Node):
                node = str(node)
            # isinstance() is used instead of comparing type() results so that
            # subclass instances are recognised with new-style classes as well.
            if isinstance(node, Node):
                self.name = node.name
                self.namespace = node.namespace
                self.parent = node.parent
                self.attrs = dict(node.attrs)
                self.data = list(node.data)
                self.kids = list(node.kids)
            else:
                # The builder initialises this very instance while parsing.
                NodeBuilder(node, self)
        else:
            self.name, self.namespace = 'tag', ''
            self.attrs, self.data, self.kids = {}, [], []
            self.parent = None

        if tag:
            # "ns name" -> (ns, name); a bare "name" keeps the current namespace.
            self.namespace, self.name = ([self.namespace] + tag.split())[-2:]
        if parent:
            self.parent = parent
        if self.parent and not self.namespace:
            self.namespace = self.parent.namespace
        self.attrs.update(attrs)
        if isinstance(payload, (type(''), type(u''))):
            payload = [payload]
        for item in payload:
            if isinstance(item, Node):
                self.addChild(node=item)
            else:
                self.data.append(ustr(item))

    def __str__(self, fancy=0):
        """ Dump the node into its textual representation.

            If "fancy" is true, indented output is produced for readability; in that mode "fancy"
            is also the nesting level (1 for the outermost node) and text chunks are stripped.
            Text chunk number i is written before child number i and every chunk left after the
            last child is written after it, so no text is lost. """
        def escaped(text):
            if fancy:
                text = text.strip()
            return XMLescape(text)

        indent = (fancy - 1) * 2 * ' '
        if fancy:
            newline = "\n"
        else:
            newline = ""

        head = [indent, "<", self.name]
        # The namespace is declared only where it differs from the parent's one.
        if self.namespace and (not self.parent or self.parent.namespace != self.namespace):
            head.append(' xmlns="%s"' % self.namespace)
        for key in self.attrs:
            head.append(' %s="%s"' % (key, XMLescape(ustr(self.attrs[key]))))

        body = []
        index = 0
        for kid in self.kids:
            if index < len(self.data):
                body.append(escaped(self.data[index]))
            body.append(kid.__str__(fancy and fancy + 1))
            index += 1
        for text in self.data[index:]:
            body.append(escaped(text))
        content = ''.join(body)

        if not self.kids and not content:
            # Nothing inside: use the short "<name />" form.
            return ''.join(head) + ' />' + newline
        opening = ''.join(head) + ">"
        if self.kids:
            opening = opening + newline
        closing = "</" + self.name + ">" + newline
        if fancy and not self.data:
            closing = indent + closing
        return opening + content + closing

    def getCDATA(self):
        """ Serialise the node, dropping all tags and leaving the CDATA intact.
            That effectively kills all formatting, leaving only the text that was contained
            in the XML, including the text of all descendants. """
        parts = []
        index = 0
        for kid in self.kids:
            if index < len(self.data):
                parts.append(self.data[index])
            parts.append(kid.getCDATA())
            index += 1
        parts.extend(self.data[index:])
        return ''.join(parts)

    def addChild(self, name=None, attrs={}, payload=[], namespace=None, node=None):
        """ If "node" is provided, adds it as a child node (making this node its parent).
            Otherwise creates a new node from the other arguments and adds it.
            Returns the added node. Raises AttributeError if "attrs" contains 'xmlns'. """
        if 'xmlns' in attrs:
            raise AttributeError("Use namespace=x instead of attrs={'xmlns':x}")
        if node:
            newnode = node
            node.parent = self
        else:
            if namespace:
                name = namespace + ' ' + name
            newnode = Node(tag=name, parent=self, attrs=attrs, payload=payload)
        self.kids.append(newnode)
        return newnode

    def addData(self, data):
        """ Adds some CDATA to the node. """
        self.data.append(ustr(data))

    def clearData(self):
        """ Removes all CDATA from the node. """
        self.data = []

    def delAttr(self, key):
        """ Deletes the attribute "key". Raises KeyError if there is no such attribute. """
        del self.attrs[key]

    def delChild(self, node, attrs={}):
        """ Deletes "node" from the list of children if "node" is a Node instance.
            Otherwise deletes the first child that has the specified name and (optionally)
            attributes. Returns the deleted node. Raises ValueError if nothing matches. """
        if not isinstance(node, Node):
            node = self.getTag(node, attrs)
        self.kids.remove(node)
        return node

    def getAttrs(self):
        """ Returns all attributes of the node as a dictionary (the node's own one, not a copy). """
        return self.attrs

    def getAttr(self, key):
        """ Returns the value of the specified attribute or None if it is not set. """
        return self.attrs.get(key)

    def getChildren(self):
        """ Returns all child nodes as a list (the node's own one, not a copy). """
        return self.kids

    def getData(self):
        """ Returns all CDATA of the node as a single (concatenated) string. """
        return ''.join(self.data)

    def getName(self):
        """ Returns the name of the node. """
        return self.name

    def getNamespace(self):
        """ Returns the namespace of the node. """
        return self.namespace

    def getParent(self):
        """ Returns the parent of the node (if present). """
        return self.parent

    def getPayload(self):
        """ Returns the payload of the node, i.e. the list of child nodes and CDATA entries.
            F.e. for "<node>text1<nodea/><nodeb/> text2</node>" the returned list is:
            ['text1', <nodea instance>, <nodeb instance>, ' text2'].
            Empty CDATA entries are skipped. """
        ret = []
        for i in range(max(len(self.data), len(self.kids))):
            if i < len(self.data) and self.data[i]:
                ret.append(self.data[i])
            if i < len(self.kids):
                ret.append(self.kids[i])
        return ret

    def getTag(self, name, attrs={}, namespace=None):
        """ Filters the child nodes using the specified arguments as a filter.
            Returns the first node found or None if there is none. """
        return self.getTags(name, attrs, namespace, one=1)

    def getTagAttr(self, tag, attr):
        """ Returns the attribute value of the child with the specified name
            (or None if there is no such child or attribute). """
        child = self.getTag(tag)
        if child is None:
            return None
        return child.attrs.get(attr)

    def getTagData(self, tag):
        """ Returns the concatenated CDATA of the child with the specified name
            (or None if there is no such child). """
        child = self.getTag(tag)
        if child is None:
            return None
        return child.getData()

    def getTags(self, name, attrs={}, namespace=None, one=0):
        """ Filters the child nodes using the specified arguments as a filter.
            Returns the list of nodes found. If "one" is true, returns the first
            matching node instead, or None if nothing matches. """
        nodes = []
        for node in self.kids:
            if namespace and namespace != node.getNamespace():
                continue
            if node.getName() != name:
                continue
            for key in attrs:
                if key not in node.attrs or node.attrs[key] != attrs[key]:
                    break
            else:
                # Every requested attribute is present with the requested value.
                if one:
                    return node
                nodes.append(node)
        if one:
            return None
        return nodes

    def setAttr(self, key, val):
        """ Sets the attribute "key" to the value "val". """
        self.attrs[key] = val

    def setData(self, data):
        """ Sets the CDATA of the node to the provided string. Resets all previous CDATA! """
        self.data = [ustr(data)]

    def setName(self, val):
        """ Changes the node name. """
        self.name = val

    def setNamespace(self, namespace):
        """ Changes the node namespace. """
        self.namespace = namespace

    def setParent(self, node):
        """ Sets the parent of the node to "node". WARNING: does not check whether a parent is
            already present and does not remove the node from the children of the previous parent. """
        self.parent = node

    def setPayload(self, payload, add=0):
        """ Sets the node payload according to the list specified (a single string is accepted too).
            Node entries become the new list of children, or are appended to the present children
            if "add" is true. Any other entry is text and is appended to the CDATA of the node
            instead of being stored among the children, where it would break serialisation.
            WARNING: existing CDATA is never removed, and the parent of the given nodes is not
            changed. If you wish just to add a child or CDATA, use the addChild or addData methods. """
        if isinstance(payload, (type(''), type(u''))):
            payload = [payload]
        if not add:
            self.kids = []
        for item in payload:
            if isinstance(item, Node):
                self.kids.append(item)
            else:
                self.data.append(ustr(item))

    def setTag(self, name, attrs={}, namespace=None):
        """ Same as getTag, but if a node with the specified name/namespace/attributes is not found,
            creates such a node and returns it. """
        node = self.getTags(name, attrs, namespace=namespace, one=1)
        if node:
            return node
        return self.addChild(name, attrs, namespace=namespace)

    def setTagAttr(self, tag, attr, val):
        """ Creates a new child (if not already present) with the name "tag"
            and sets its attribute "attr" to the value "val". """
        child = self.getTag(tag)
        if child is None:
            self.addChild(tag, attrs={attr: val})
        else:
            child.attrs[attr] = val

    def setTagData(self, tag, val, attrs={}):
        """ Creates a new child (if not already present) with the name "tag" and (optionally)
            the attributes "attrs" and sets its CDATA to the string "val". """
        child = self.getTag(tag, attrs)
        if child is None:
            self.addChild(tag, attrs, payload=[ustr(val)])
        else:
            child.setData(ustr(val))

    def has_attr(self, key):
        """ Checks whether the node has the attribute "key". """
        return key in self.attrs

    def __getitem__(self, item):
        """ Returns the value of the node's attribute "item" (None if it is not set). """
        return self.getAttr(item)

    def __setitem__(self, item, val):
        """ Sets the value of the node's attribute "item". """
        return self.setAttr(item, val)

    def __delitem__(self, item):
        """ Deletes the node's attribute "item". """
        return self.delAttr(item)

    def __getattr__(self, attr):
        """ Reduce the memory usage caused by the T/NT classes - use memory only when needed.
            The helper is created on first access and then stored on the instance, so this
            method is not called for it again. Any other missing attribute raises AttributeError. """
        if attr == 'T':
            self.T = T(self)
            return self.T
        if attr == 'NT':
            self.NT = NT(self)
            return self.NT
        raise AttributeError(attr)

class T:
    """ Helper giving attribute-style access to the children of a node:
        reading an attribute looks a child up by name, assigning to it creates
        or updates that child, and deleting it removes the child. """

    def __init__(self, node):
        # Store the owner straight in the instance dictionary: a plain
        # assignment would be routed through __setattr__ below.
        self.__dict__['node'] = node

    def __getattr__(self, attr):
        owner = self.node
        return owner.getTag(attr)

    def __setattr__(self, attr, val):
        owner = self.node
        if not isinstance(val, Node):
            return owner.setTagData(attr, val)
        # Re-initialise the (found or freshly created) child as a replica of "val".
        Node.__init__(owner.setTag(attr), node=val)

    def __delattr__(self, attr):
        owner = self.node
        return owner.delChild(attr)

class NT(T):
    """ Helper for quickly creating children of a node: unlike T, every
        attribute access or assignment adds a new child to the node. """

    def __getattr__(self, attr):
        owner = self.node
        return owner.addChild(attr)

    def __setattr__(self, attr, val):
        owner = self.node
        if not isinstance(val, Node):
            # Anything that is not a node becomes the payload of a new child.
            return owner.addChild(attr, payload=[val])
        owner.addChild(attr, node=val)

DBG_NODEBUILDER = 'nodebuilder'


class NodeBuilder:
    """ Builds a Node class minidom from the data passed to it. This class is used for two purposes:
        1. Creating an XML Node from a textual representation, f.e. when reading a config file.
           See the XML2Node function.
        2. Handling an incoming XML stream. This is done by changing the _dispatch_depth
           attribute and redefining the dispatch method.
        You do not need to use this class directly unless you are designing your own XML handler. """

    def __init__(self, data=None, initial_node=None):
        """ Takes two optional parameters: "data" and "initial_node".
            By default the built node is a new Node instance.
            If "initial_node" is provided, it is used as the starting point instead:
            the top-level element is built into that very instance ("node upgrade").
            "data" (if provided) is fed to the parser as a complete document
            immediately after the instance is initialised. """
        self.DEBUG(DBG_NODEBUILDER, "Preparing to handle incoming XML stream.", 'start')
        # With a namespace separator expat reports names as "uri localname".
        self._parser = xml.parsers.expat.ParserCreate(namespace_separator=' ')
        self._parser.StartElementHandler = self.starttag
        self._parser.EndElementHandler = self.endtag
        self._parser.CharacterDataHandler = self.handle_data
        self._parser.StartNamespaceDeclHandler = self.handle_namespace_start
        self.Parse = self._parser.Parse

        self.__depth = 0            # current nesting level of the parser
        self._dispatch_depth = 1    # level whose elements are built and dispatched
        self._document_attrs = None
        self._mini_dom = initial_node
        self.last_is_data = 1       # true while the previous parser event was character data
        self._ptr = None            # node that is currently being filled
        self.namespaces = {"http://www.w3.org/XML/1998/namespace": 'xml:'}
        self.xmlns = "http://www.w3.org/XML/1998/namespace"

        if data:
            self._parser.Parse(data, 1)

    def destroy(self):
        """ Detaches the parser callbacks, breaking the reference cycle between the
            builder and the parser so that the instance can be garbage-collected. """
        self._parser.StartElementHandler = None
        self._parser.EndElementHandler = None
        self._parser.CharacterDataHandler = None
        self._parser.StartNamespaceDeclHandler = None

    def starttag(self, tag, attrs):
        """XML Parser callback. Used internally"""
        # FIXME: Crude hack. And it also slows down the whole library considerably.
        # Attributes with a namespace arrive as "uri name"; rename them to
        # "prefix:name". The keys are copied first because the dictionary is
        # modified inside the loop.
        for attr in list(attrs.keys()):
            sp = attr.rfind(" ")
            if sp == -1:
                continue
            ns = attr[:sp]
            attrs[self.namespaces[ns] + attr[sp+1:]] = attrs[attr]
            del attrs[attr]
        self.__depth += 1
        self.DEBUG(DBG_NODEBUILDER, "DEPTH -> %i , tag -> %s, attrs -> %s" % (self.__depth, tag, repr(attrs)), 'down')
        if self.__depth == self._dispatch_depth:
            if not self._mini_dom:
                self._mini_dom = Node(tag=tag, attrs=attrs)
            else:
                Node.__init__(self._mini_dom, tag=tag, attrs=attrs)
            self._ptr = self._mini_dom
        elif self.__depth > self._dispatch_depth:
            self._ptr.kids.append(Node(tag=tag, parent=self._ptr, attrs=attrs))
            self._ptr = self._ptr.kids[-1]
        if self.__depth == 1:
            self._document_attrs = attrs
            ns, name = ([''] + tag.split())[-2:]
            self.stream_header_received(ns, name, attrs)
        # No text preceded this element: pad the parent's data list with an empty
        # entry so that text chunks and children stay aligned by index.
        if not self.last_is_data and self._ptr.parent:
            self._ptr.parent.data.append('')
        self.last_is_data = 0

    def endtag(self, tag):
        """XML Parser callback. Used internally"""
        self.DEBUG(DBG_NODEBUILDER, "DEPTH -> %i , tag -> %s" % (self.__depth, tag), 'up')
        if self.__depth == self._dispatch_depth:
            self.dispatch(self._mini_dom)
        elif self.__depth > self._dispatch_depth:
            self._ptr = self._ptr.parent
        else:
            self.DEBUG(DBG_NODEBUILDER, "Got higher than dispatch level. Stream terminated?", 'stop')
        self.__depth -= 1
        self.last_is_data = 0
        if self.__depth == 0:
            self.stream_footer_received()

    def handle_data(self, data):
        """XML Parser callback. Used internally"""
        self.DEBUG(DBG_NODEBUILDER, data, 'data')
        if not self._ptr:
            return
        if self.last_is_data:
            # expat may deliver one text run in several pieces: glue them together.
            self._ptr.data[-1] += data
        else:
            self._ptr.data.append(data)
            self.last_is_data = 1

    def handle_namespace_start(self, prefix, uri):
        """XML Parser callback. Used internally"""
        if prefix:
            self.namespaces[uri] = prefix + ':'
        else:
            self.xmlns = uri

    def DEBUG(self, level, text, comment=None):
        """ Gets all NodeBuilder walking events. Can be used for debugging if redefined."""

    def getDom(self):
        """ Returns the node built so far (None if no top-level element has been built). """
        return self._mini_dom

    def dispatch(self, stanza):
        """ Gets called with the built node as argument when an element at the dispatch depth
            is closed. Can be redefined to convert incoming XML stanzas to program events. """

    def stream_header_received(self, ns, tag, attrs):
        """ Method called when the stream has just been opened (the outermost element started). """

    def stream_footer_received(self):
        """ Method called when the stream has just been closed (the outermost element ended). """

def XML2Node(xml):
    """ Parse the supplied text as one complete XML document and return the node built from it.
        Handy f.e. for reading a configuration file.
        The parser raises xml.parsers.expat.ExpatError if the string is not well-formed XML. """
    builder = NodeBuilder(xml)
    return builder.getDom()

def BadXML2Node(xml):
    """ Converts the supplied text into an XML node. Survives if the XML data is cut half way
        through, i.e. "<html>some text <br>some more text": the node built from the part
        that could be parsed is returned (None if not even the first tag was complete).
        Misplaced tags still raise xml.parsers.expat.ExpatError, f.e.
        "<b>some text <br>some more text</b>" will not work. """
    builder = NodeBuilder()
    # Feed the text as a non-final chunk: the parser then does not require the
    # document to be complete, which is the whole difference from XML2Node.
    # NOTE(review): trailing text the parser is still holding back may be missing.
    builder.Parse(xml, 0)
    return builder.getDom()