1
2
3
4
5
6
# Compatibility shim: make sure the name ``set`` is usable even on old
# interpreters that provide no builtin set type.
try:
    set
except NameError:
    # Only a missing builtin should trigger the fallback. A bare ``except:``
    # would also swallow unrelated errors (e.g. KeyboardInterrupt).
    import sets
    set = sets.Set
13
14 """Cerealizer -- A secure Pickle-like module
15
16 The interface of the Cerealizer module is similar to Pickle, and it supports
17 __getstate__, __setstate__, __getinitargs__ and __getnewargs__.
18
19 Cerealizer supports int, long, float, bool, complex, string, unicode, tuple, list, set, frozenset,
20 dict, old-style and new-style class instances. C-defined types are supported but saving the C-side
21 data may require writing e.g. a specific Handler or a __getstate__ and __setstate__ pair.
22 Objects with __slots__ are supported too.
23
24 You have to register the class you want to serialize, by calling cerealizer.register(YourClass).
25 Cerealizer can be considered as secure AS LONG AS the following methods of 'YourClass' are secure:
26 - __new__
27 - __del__
28 - __getstate__
29 - __setstate__
30 - __init__ (ONLY if __getinitargs__ is used for the class)
31
32 These methods are the only ones Cerealizer may call. For higher security, Cerealizer maintains
33 its own references to these methods (except __del__, which can only be called indirectly).
34
35 Cerealizer doesn't aim at producing human-readable files. Regarding performance, Cerealizer is
36 really fast and, when powered by Psyco, it may even beat cPickle! And yet Cerealizer is
37 implemented in fewer than 500 lines of pure-Python code (which is another reason for Cerealizer
38 to be secure, since less code means fewer bugs :-).
39
40 Compared to Pickle (cPickle):
41 - Cerealizer is secure
42 - Cerealizer achieves similar performances (using Psyco)
43 - Cerealizer requires you to declare the serializable classes
44
45 Compared to Jelly (from TwistedMatrix):
46 - Cerealizer is faster
47 - Cerealizer does a better job with object cycles, C-defined types and tuples (*)
48 - Cerealizer files are not Human readable
49
50 (*) Jelly handles them, but tuples and objects in a cycle are first created as _Tuple or
51 _Dereference objects; this works for Python classes, but not with C-defined types which
52 expect a precise type (e.g. tuple and not _Tuple).
53
54
55
56 IMPLEMENTATION DETAILS
57
58 GENERAL FILE FORMAT STRUCTURE
59
60 Cerealizer format is simple but quite surprising. It uses a "double flat list" format.
61 It looks like this:
62
63 <magic code (currently cereal1)>\\n
64 <number of objects>\\n
65 <classname of object #0>\\n
66 <optional data for creating object #0 (currently nothing except for tuples)>
67 <classname of object #1>\\n
68 <optional data for creating object #1 (currently nothing except for tuples)>
69 [...]
70 <data of object #0 (format depends on the type of object #0)>
71 <data of object #1 (format depends on the type of object #1)>
72 [...]
73 <reference to the 'root' object>
74
75 As you can see, the information for a given object is split into two parts, the first one
76 for the object's class, and the second one for the object's data.
77
78 To avoid problems, the order of the objects is the following:
79
80 <list, dict, set>
81 <object, instance>
82 <tuple, sorted by depth (=max number of folded tuples)>
83
84 Objects are put after basic types (list,...), since object's __setstate__ might rely on
85 a list, and thus the list must be fully loaded BEFORE calling the object's __setstate__.
86
87
88 DATA (<data of object #n> above)
89
90 The part <data of object #n> saves the data of object #n. It may contain references to other data
91 (see below; in Cerealizer, references include references to other objects but also raw data like int).
92
93 - an object is saved by : <reference to the object state (the value returned by object.__getstate__() or object.__dict__)>
94 e.g. 'r7\\n' (object #7 being e.g. the __dict__).
95
96 - a list or a set is saved by : <number of item>\\n
97 <reference to item #0>
98 <reference to item #1>
99 [...]
100 e.g. '3\\ni0\\ni1\\ni2\\n' for [0, 1, 2]
101
102 - a dict is saved by : <number of item>\\n
103 <reference to value #0>
104 <reference to key #0>
105 <reference to value #1>
106 <reference to key #1>
107 [...]
108
109
110 REFERENCES (<reference to XXX> above)
111
112 In Cerealizer a reference can be either a reference to another object being serialized in the
113 same file, or a raw value (e.g. an integer).
114 - an int is saved by e.g. 'i187\\n'
115 - a long is saved by e.g. 'l10000000000\\n'
116 - a float is saved by e.g. 'f1.07\\n'
117 - a bool is saved by 'b0' or 'b1'
118 - a string is saved by e.g. 's5\\nascii' (where 5 is the number of characters)
119 - a unicode is saved by e.g. 'u4\\nutf8' (where 4 is the number of bytes of the UTF-8 encoded text)
120 - an object reference is saved by e.g. 'r3\\n' (where 3 means reference to object #3)
121 - None is saved by 'n'
122 """
123
# Names intended as the public API of the module.
# NOTE(review): the variable is spelled ``__alls__``, not ``__all__``, so it is an
# ordinary module attribute and does not restrict ``from <module> import *``.
# Renaming it would change what a star-import exports, so it is left untouched here.
__alls__ = ["load", "dump", "loads", "dumps", "freeze_configuration", "register"]
# Version string of the module.
VERSION = "0.5"
126
127 import logging
# Module-level logger; registration and configuration-freeze events are reported through it.
logger = logging.getLogger("cerealizer")
129
130
131 from cStringIO import StringIO
132 from new import instance
133
136
138
def dump(self, root_obj, s):
    """Dumper.dump(ROOT_OBJ, S)

    Serializes ROOT_OBJ, together with everything collected from it, in file S.

    The output is written in this order:
      1. the magic line "cereal1" and the number of objects,
      2. one dump_obj() record per object,
      3. one dump_data() record per object,
      4. a reference to ROOT_OBJ."""
    # Per-dump bookkeeping, reset on every call.
    self.objs = []
    self.objs_id = set()
    self.priorities_objs = []
    self.obj2state = {}
    self.obj2newargs = {}
    self.id2id = {}

    self.collect(root_obj)

    # Objects collected with a priority are appended, in sorted order, after
    # the objects that were appended to self.objs directly.
    self.priorities_objs.sort(_priority_sorter)
    for _priority, prioritized in self.priorities_objs:
        self.objs.append(prioritized)

    s.write("cereal1\n%s\n" % len(self.objs))

    # Map each object's id() to its position in the file; dump_ref() writes
    # that position.
    for index, obj in enumerate(self.objs):
        self.id2id[id(obj)] = index

    # First pass: what is needed to create each object.
    for obj in self.objs:
        handler = _HANDLERS_[obj.__class__]
        handler.dump_obj(obj, self, s)

    # Second pass: the data of each object.
    for obj in self.objs:
        handler = _HANDLERS_[obj.__class__]
        handler.dump_data(obj, self, s)

    root_handler = _HANDLERS_[root_obj.__class__]
    root_handler.dump_ref(root_obj, self, s)
162
176
178 """Dumper.collect(OBJ) -> bool
179
180 Collects OBJ for serialization. Returns false if OBJ is already collected; else returns true."""
181 handler = _HANDLERS_.get(obj.__class__)
182 if not handler: raise NonCerealizableObjectError("Object of class/type '%s' cannot be cerealized! Use cerealizer.register to extend Cerealizer support to other classes." % obj.__class__)
183 handler.collect(obj, self)
184
186 """Dumper.dump_ref(OBJ, S)
187
188 Writes a reference to OBJ in file S."""
189 _HANDLERS_[obj.__class__].dump_ref(obj, self, s)
190
192 """Dumper.undump_ref(S) -> obj
193
194 Reads a reference from file S."""
195 c = s.read(1)
196 if c == "i": return int (s.readline())
197 elif c == "f": return float(s.readline())
198 elif c == "s": return s.read(int(s.readline()))
199 elif c == "u": return s.read(int(s.readline())).decode("utf8")
200 elif c == "r": return self.id2obj[int(s.readline())]
201 elif c == "n": return None
202 elif c == "b": return bool(int(s.read(1)))
203 elif c == "l": return long(s.readline())
204 elif c == "c": return complex(s.readline())
205 raise ValueError("Unknown ref code '%s'!" % c)
206
208 depth = 0
209 for i in t:
210 i2 = self.obj2newargs.get(id(i))
211 if not i2 is None: i = i2
212 if isinstance(i, tuple) or isinstance(i, frozenset):
213 x = self.immutable_depth(i)
214 if x > depth: depth = x
215 return depth + 1
216
218 """Handler
219
220 A customized handler for serialization and deserialization.
221 You can subclass it to extend cerealization support to new object.
222 See also ObjHandler."""
223
225 """Handler.collect(obj, dumper) -> bool
226
227 Collects all the objects referenced by OBJ.
228 For each object ROBJ referenced by OBJ, calls the collect method of the Handler for ROBJ's class,
229 i.e. _HANDLERS_[ROBJ.__class__].collect(ROBJ, dumper).
230 Returns false if OBJ is already referenced (and thus no collection should occur); else returns true.
231 """
232 i = id(obj)
233 if not i in dumper.objs_id:
234 dumper.objs.append(obj)
235 dumper.objs_id.add(i)
236 return 1
237
239 """Handler.dump_obj(obj, dumper, s)
240
241 Dumps OBJ classname in file S."""
242 s.write(self.classname)
243
245 """Handler.dump_data(obj, dumper, s)
246
247 Dumps OBJ data in file S."""
248
250 """Handler.dump_ref(obj, dumper, s)
251
252 Write a reference to OBJ in file S.
253 You should not override dump_ref, since there is no corresponding 'undump_ref' that you
254 can override."""
255 s.write("r%s\n" % dumper.id2id[id(obj)])
256
258 """Handler.undump_obj(dumper, s)
259
260 Returns a new uninitialized (=no __init__'ed) instance of the class.
261 If you override undump_obj, DUMPER and file S can be used to read additional data
262 saved by Handler.dump_obj()."""
263
265 """Handler.undump_data(obj, dumper, s)
266
267 Reads the data for OBJ, from DUMPER and file S.
268 If you override undump_data, you should use DUMPER.undump_ref(S) to
269 read a reference or a basic type (=a string, an int,...)."""
270
271
def collect(self, obj, dumper):
    """collect(obj, dumper)

    Does nothing: OBJ is not added to DUMPER and None is returned.
    NOTE(review): presumably the handler this belongs to writes its values
    inline through dump_ref -- confirm against the enclosing class."""
    return None
def dump_obj(self, obj, dumper, s):
    """dump_obj(obj, dumper, s)

    Does nothing: no record is written in file S for OBJ, and None is returned."""
    return None
276
279
def dump_ref(self, obj, dumper, s):
    """dump_ref(obj, dumper, s)

    Writes OBJ inline in file S as: "s", the length of OBJ, a newline,
    then OBJ itself (no trailing newline). DUMPER is not used."""
    length = len(obj)
    s.write("s%s\n%s" % (length, obj))
282
285 obj = obj.encode("utf8")
286 s.write("u%s\n%s" % (len(obj), obj))
287
290
293
296
299
302 c = str(obj)
303 if c.startswith("("): c = c[1:-1]
304 s.write("c%s\n" % c)
305
306
308 classname = "tuple\n"
310 if not id(obj) in dumper.objs_id:
311 dumper.priorities_objs.append((dumper.immutable_depth(obj), obj))
312 dumper.objs_id.add(id(obj))
313
314 for i in obj: dumper.collect(i)
315 return 1
316
320
def undump_obj(self, dumper, s):
    """undump_obj(dumper, s) -> tuple

    Reads an item count from the next line of file S, then reads that many
    references through DUMPER.undump_ref(S) and returns them as a tuple."""
    count = int(s.readline())
    items = []
    for _ in range(count):
        items.append(dumper.undump_ref(s))
    return tuple(items)
322
326
327
329 classname = "list\n"
334
338
340
342 for i in range(int(s.readline())): obj.append(dumper.undump_ref(s))
343
349
351 classname = "dict\n"
353 if Handler.collect(self, obj, dumper):
354 for i in obj.iterkeys (): dumper.collect(i)
355 for i in obj.itervalues(): dumper.collect(i)
356 return 1
357
363
365
369
370
372 """ObjHandler
373
374 A Cerealizer Handler that can support any new-style class instances, old-style class instances
375 as well as C-defined types (although it may not save the C-side data)."""
def __init__(self, Class, classname = ""):
    """__init__(Class, classname = "")

    Creates a handler for CLASS.
    The handler keeps its own references to CLASS.__new__, CLASS.__getstate__
    and CLASS.__setstate__, looked up once here. When CLASS has no __new__,
    'instance' is used instead; a missing __getstate__ / __setstate__ is
    stored as None.
    CLASSNAME is the name written in files; when empty, "<module>.<name>" of
    CLASS is used. A trailing newline is appended in both cases."""
    self.Class = Class
    self.Class_new = getattr(Class, "__new__", instance)
    self.Class_getstate = getattr(Class, "__getstate__", None)
    self.Class_setstate = getattr(Class, "__setstate__", None)

    if not classname:
        classname = "%s.%s" % (Class.__module__, Class.__name__)
    # The stored classname always ends with "\n".
    self.classname = "%s\n" % classname
383
385 i = id(obj)
386 if not i in dumper.objs_id:
387 dumper.priorities_objs.append((-1, obj))
388 dumper.objs_id.add(i)
389
390 if self.Class_getstate: state = self.Class_getstate(obj)
391 else: state = obj.__dict__
392 dumper.obj2state[i] = state
393 dumper.collect(state)
394 return 1
395
399
def undump_obj(self, dumper, s):
    """undump_obj(dumper, s) -> obj

    Returns a new instance created by calling the stored Class_new with the
    stored Class as its only argument. DUMPER and file S are not used."""
    create = self.Class_new
    return create(self.Class)
401
403 if self.Class_setstate: self.Class_setstate(obj, dumper.undump_ref(s))
404 else: obj.__dict__ = dumper.undump_ref(s)
405
407 """SlotedObjHandler
408
409 A Cerealizer Handler that can support new-style class instances with __slots__."""
410 - def __init__(self, Class, classname = ""):
413
415 i = id(obj)
416 if not i in dumper.objs_id:
417 dumper.priorities_objs.append((-1, obj))
418 dumper.objs_id.add(i)
419
420 if self.Class_getstate: state = self.Class_getstate(obj)
421 else: state = dict([(slot, getattr(obj, slot, None)) for slot in self.Class_slots])
422 dumper.obj2state[i] = state
423 dumper.collect(state)
424 return 1
425
427 if self.Class_setstate: self.Class_setstate(obj, dumper.undump_ref(s))
428 else:
429 state = dumper.undump_ref(s)
430 for slot in self.Class_slots: setattr(obj, slot, state[slot])
431
433 """InitArgsObjHandler
434
435 A Cerealizer Handler that can support class instances with __getinitargs__."""
436 - def __init__(self, Class, classname = ""):
440
442 i = id(obj)
443 if not i in dumper.objs_id:
444 dumper.priorities_objs.append((-1, obj))
445 dumper.objs_id.add(i)
446
447 dumper.obj2state[i] = state = self.Class_getinitargs(obj)
448 dumper.collect(state)
449 return 1
450
452
454 """NewArgsObjHandler
455
456 A Cerealizer Handler that can support class instances with __getnewargs__."""
457 - def __init__(self, Class, classname = ""):
460
462 i = id(obj)
463 if not i in dumper.objs_id:
464 dumper.obj2newargs[i] = newargs = self.Class_getnewargs(obj)
465 dumper.collect(newargs)
466
467 dumper.priorities_objs.append((dumper.immutable_depth(newargs), obj))
468 dumper.objs_id.add(i)
469
470 if self.Class_getstate: state = self.Class_getstate(obj)
471 else: state = obj.__dict__
472 dumper.obj2state[i] = state
473 dumper.collect(state)
474 return 1
475
480
482
483
# Set to 0 by freeze_configuration(); register() refuses to run once it is 0.
_configurable = 1
# Maps a classname string (including its trailing "\n") to its handler; aliases are stored here too.
_HANDLERS = {}
# Maps a class / type object to its handler.
_HANDLERS_ = {}
def register(Class, handler = None, classname = ""):
    """register(Class, handler = None, classname = "")

    Declares CLASS as a class that can be serialized and de-serialized.
    By calling register, YOU ASSERT THAT THE FOLLOWING METHODS OF CLASS ARE SECURE:
      - CLASS.__new__
      - CLASS.__del__
      - CLASS.__getstate__
      - CLASS.__setstate__
      - CLASS.__getinitargs__
      - CLASS.__getnewargs__
      - CLASS.__init__ (only if CLASS.__getinitargs__ exists)

    HANDLER is the Handler object in charge of CLASS. When it is not given, one
    is chosen from the attributes of CLASS, in this order:
      - __getnewargs__  -> NewArgsObjHandler
      - __getinitargs__ -> InitArgsObjHandler
      - __slots__       -> SlotedObjHandler
      - otherwise       -> ObjHandler

    CLASSNAME is the name used for CLASS in Cerealizer files; it is passed to the
    handler created here and must not contain a newline.

    Raises StandardError if freeze_configuration() has already been called, and
    ValueError if CLASSNAME contains a newline, if CLASS is already registered,
    or if another class is already registered under the same classname."""
    if not _configurable:
        raise StandardError("Cannot register new classes after freeze_configuration has been called!")
    if "\n" in classname:
        raise ValueError("CLASSNAME cannot have \\n (Cerealizer automatically add a trailing \\n for performance reason)!")

    if not handler:
        # No handler supplied: pick the handler type that matches CLASS.
        if hasattr(Class, "__getnewargs__"):
            handler_factory = NewArgsObjHandler
        elif hasattr(Class, "__getinitargs__"):
            handler_factory = InitArgsObjHandler
        elif hasattr(Class, "__slots__"):
            handler_factory = SlotedObjHandler
        else:
            handler_factory = ObjHandler
        handler = handler_factory(Class, classname)

    if Class in _HANDLERS_:
        raise ValueError("Class %s has already been registred!" % Class)

    if isinstance(handler, RefHandler):
        # RefHandler instances are not entered in the classname table.
        logger.info("Registring reference '%s'" % Class)
    else:
        stored_name = handler.classname
        if stored_name in _HANDLERS:
            raise ValueError("A class has already been registred under the name %s!" % stored_name[:-1])
        _HANDLERS[stored_name] = handler
        # [:-1] strips the trailing "\n" of the stored classname for display.
        if handler.__class__ is ObjHandler:
            logger.info("Registring class %s as '%s'" % (Class, stored_name[:-1]))
        else:
            logger.info("Registring class %s as '%s' (using %s)" % (Class, stored_name[:-1], handler.__class__.__name__))

    _HANDLERS_[Class] = handler
525
# Second name bound to the same function object as register.
register_class = register
527
def register_alias(Class, alias):
    """register_alias(Class, alias)

    Registers ALIAS as an alias classname for CLASS.
    Useful for keeping backward compatibility in files: e.g. if you have renamed OldClass to
    NewClass, just do:

      cerealizer.register_alias(NewClass, "OldClass")

    and you'll be able to open old files containing OldClass serialized.

    Raises ValueError if CLASS is not registered yet, or if ALIAS is already used
    as a classname or as an alias."""
    handler = _HANDLERS_.get(Class)
    if not handler:
        raise ValueError("Cannot register alias '%s' to Class %s: the class is not yet registred!" % (alias, Class))

    # Keys of _HANDLERS always end with "\n", so the duplicate check has to use
    # the same key that is stored below. Testing the bare alias never matched,
    # which let an alias silently replace an existing classname.
    key = alias + "\n"
    if key in _HANDLERS:
        raise ValueError("Cannot register alias '%s' to Class %s: another class is already registred under the alias name!" % (alias, Class))

    logger.info("Registring alias '%s' for %s" % (alias, Class))
    _HANDLERS[key] = handler
545
546
548 """freeze_configuration()
549
550 Ends Cerealizer configuration. When freeze_configuration() is called, it is no longer possible
551 to register classes, using register().
552 Calling freeze_configuration() is not mandatory, but it may enforce security, by forbidding
553 unexpected calls to register()."""
554 global _configurable
555 _configurable = 0
556 logger.info("Configuration frozen")
557
# Register the built-in types with one handler instance each.
# These calls run while _configurable is still 1, so register() accepts them.
# NOTE(review): the handler classes used here are defined outside the visible
# part of the file; what each one writes is not checked here.
register(type(None), NoneHandler ())
register(str , StrHandler ())
register(unicode , UnicodeHandler ())
register(bool , BoolHandler ())
register(int , IntHandler ())
register(long , LongHandler ())
register(float , FloatHandler ())
register(complex , ComplexHandler ())
register(dict , DictHandler ())
register(list , ListHandler ())
register(set , SetHandler ())
register(tuple , TupleHandler ())
register(frozenset , FrozensetHandler())
571
572
def dump(obj, file, protocol = 0):
    """dump(obj, file, protocol = 0)

    Writes the serialized form of OBJ in FILE, using a fresh Dumper.
    PROTOCOL is ignored; it is accepted only for compatibility with Pickle."""
    dumper = Dumper()
    dumper.dump(obj, file)
579
581 """load(file) -> obj
582
583 De-serializes an object from FILE."""
584 return Dumper().undump(file)
585
def dumps(obj, protocol = 0):
    """dumps(obj, protocol = 0) -> str

    Serializes OBJ into an in-memory StringIO buffer, using a fresh Dumper,
    and returns the content of that buffer.
    PROTOCOL is ignored; it is accepted only for compatibility with Pickle."""
    out = StringIO()
    dumper = Dumper()
    dumper.dump(obj, out)
    serialized = out.getvalue()
    return serialized
594
596 """loads(string) -> obj
597
598 De-serializes an object from STRING."""
599 return Dumper().undump(StringIO(string))
600
601
603 """dump_class_of_module(*modules)
604
605 Utility function; for each class found in the given modules, prints the needed call to register."""
606 class D: pass
607 class O(object): pass
608 s = set([c for module in modules for c in module.__dict__.values() if isinstance(c, type(D)) or isinstance(c, type(O))])
609 l = ['cerealizer.register(%s.%s)' % (c.__module__, c.__name__) for c in s]
610 l.sort()
611 for i in l: print i
612