1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39 """
40 Provides filesystem-related objects.
41 @sort: FilesystemList, BackupFileList, PurgeItemList
42 @author: Kenneth J. Pronovici <pronovic@ieee.org>
43 """
44
45
46
47
48
49
50
51 import sys
52 import os
53 import re
54 import sha
55 import logging
56 import tarfile
57
58
59 from CedarBackup2.knapsack import firstFit, bestFit, worstFit, alternateFit
60 from CedarBackup2.util import AbsolutePathList, ObjectTypeList, UnorderedList, RegexList
61 from CedarBackup2.util import removeKeys, displayBytes, calculateFileAge, encodePath
62
63
64
65
66
67
68 logger = logging.getLogger("CedarBackup2.log.filesystem")
69
70
71
72
73
74
76
77
78
79
80
81 """
82 Represents a list of filesystem items.
83
84 This is a generic class that represents a list of filesystem items. Callers
85 can add individual files or directories to the list, or can recursively add
86 the contents of a directory. The class also allows for up-front exclusions
87 in several forms (all files, all directories, all items matching a pattern,
88 all items whose basename matches a pattern, or all directories containing a
89 specific "ignore file"). Symbolic links are typically backed up
90 non-recursively, i.e. the link to a directory is backed up, but not the
91 contents of that link (we don't want to deal with recursive loops, etc.).
92
93 The custom methods such as L{addFile} will only add items if they exist on
94 the filesystem and do not match any exclusions that are already in place.
95 However, since a FilesystemList is a subclass of Python's standard list
96 class, callers can also add items to the list in the usual way, using
97 methods like C{append()} or C{insert()}. No validations apply to items
98 added to the list in this way; however, many list-manipulation methods deal
99 "gracefully" with items that don't exist in the filesystem, often by
100 ignoring them.
101
102 Once a list has been created, callers can remove individual items from the
103 list using standard methods like C{pop()} or C{remove()} or they can use
104 custom methods to remove specific types of entries or entries which match a
105 particular pattern.
106
107 @note: Regular expression patterns that apply to paths are assumed to be
108 bounded at front and back by the beginning and end of the string, i.e. they
109 are treated as if they begin with C{^} and end with C{$}. This is true
110 whether we are matching a complete path or a basename.
111
112 @note: Some platforms, like Windows, do not support soft links. On those
113 platforms, the ignore-soft-links flag can be set, but it won't do any good
114 because the operating system never reports a file as a soft link.
115
116 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
117 removeLinks, removeMatch, removeInvalid, normalize, validate,
118 excludeFiles, excludeDirs, excludeLinks, excludePaths,
119 excludePatterns, excludeBasenamePatterns, ignoreFile
120 """
121
122
123
124
125
126
144
145
146
147
148
149
151 """
152 Property target used to set the exclude files flag.
153 No validations, but we normalize the value to C{True} or C{False}.
154 """
155 if value:
156 self._excludeFiles = True
157 else:
158 self._excludeFiles = False
159
161 """
162 Property target used to get the exclude files flag.
163 """
164 return self._excludeFiles
165
167 """
168 Property target used to set the exclude directories flag.
169 No validations, but we normalize the value to C{True} or C{False}.
170 """
171 if value:
172 self._excludeDirs = True
173 else:
174 self._excludeDirs = False
175
177 """
178 Property target used to get the exclude directories flag.
179 """
180 return self._excludeDirs
181
183 """
184 Property target used to set the exclude soft links flag.
185 No validations, but we normalize the value to C{True} or C{False}.
186 """
187 if value:
188 self._excludeLinks = True
189 else:
190 self._excludeLinks = False
191
193 """
194 Property target used to get the exclude soft links flag.
195 """
196 return self._excludeLinks
197
199 """
200 Property target used to set the exclude paths list.
201 A C{None} value is converted to an empty list.
202 Elements do not have to exist on disk at the time of assignment.
203 @raise ValueError: If any list element is not an absolute path.
204 """
205 self._absoluteExcludePaths = AbsolutePathList()
206 if value is not None:
207 self._absoluteExcludePaths.extend(value)
208
210 """
211 Property target used to get the absolute exclude paths list.
212 """
213 return self._absoluteExcludePaths
214
216 """
217 Property target used to set the exclude patterns list.
218 A C{None} value is converted to an empty list.
219 """
220 self._excludePatterns = RegexList()
221 if value is not None:
222 self._excludePatterns.extend(value)
223
225 """
226 Property target used to get the exclude patterns list.
227 """
228 return self._excludePatterns
229
231 """
232 Property target used to set the exclude basename patterns list.
233 A C{None} value is converted to an empty list.
234 """
235 self._excludeBasenamePatterns = RegexList()
236 if value is not None:
237 self._excludeBasenamePatterns.extend(value)
238
240 """
241 Property target used to get the exclude basename patterns list.
242 """
243 return self._excludeBasenamePatterns
244
246 """
247 Property target used to set the ignore file.
248 The value must be a non-empty string if it is not C{None}.
249 @raise ValueError: If the value is an empty string.
250 """
251 if value is not None:
252 if len(value) < 1:
253 raise ValueError("The ignore file must be a non-empty string.")
254 self._ignoreFile = value
255
257 """
258 Property target used to get the ignore file.
259 """
260 return self._ignoreFile
261
262 excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
263 excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
264 excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
265 excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
266 excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
267 "List of regular expression patterns (matching complete path) to be excluded.")
268 excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
269 None, "List of regular expression patterns (matching basename) to be excluded.")
270 ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
271
272
273
274
275
276
278 """
279 Adds a file to the list.
280
281 The path must exist and must be a file or a link to an existing file. It
282 will be added to the list subject to any exclusions that are in place.
283
284 @param path: File path to be added to the list
285 @type path: String representing a path on disk
286
287 @return: Number of items added to the list.
288
289 @raise ValueError: If path is not a file or does not exist.
290 @raise ValueError: If the path could not be encoded properly.
291 """
292 path = encodePath(path)
293 if not os.path.exists(path) or not os.path.isfile(path):
294 logger.debug("Path [%s] is not a file or does not exist on disk." % path)
295 raise ValueError("Path is not a file or does not exist on disk.")
296 if self.excludeLinks and os.path.islink(path):
297 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
298 return 0
299 if self.excludeFiles:
300 logger.debug("Path [%s] is excluded based on excludeFiles." % path)
301 return 0
302 if path in self.excludePaths:
303 logger.debug("Path [%s] is excluded based on excludePaths." % path)
304 return 0
305 for pattern in self.excludePatterns:
306 if re.compile(r"^%s$" % pattern).match(path):
307 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
308 return 0
309 for pattern in self.excludeBasenamePatterns:
310 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
311 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
312 return 0
313 self.append(path)
314 logger.debug("Added file to list: [%s]" % path)
315 return 1
316
318 """
319 Adds a directory to the list.
320
321 The path must exist and must be a directory or a link to an existing
322 directory. It will be added to the list subject to any exclusions that
323 are in place. The L{ignoreFile} does not apply to this method, only to
324 L{addDirContents}.
325
326 @param path: Directory path to be added to the list
327 @type path: String representing a path on disk
328
329 @return: Number of items added to the list.
330
331 @raise ValueError: If path is not a directory or does not exist.
332 @raise ValueError: If the path could not be encoded properly.
333 """
334 path = encodePath(path)
335 path = normalizeDir(path)
336 if not os.path.exists(path) or not os.path.isdir(path):
337 logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
338 raise ValueError("Path is not a directory or does not exist on disk.")
339 if self.excludeLinks and os.path.islink(path):
340 logger.debug("Path [%s] is excluded based on excludeLinks." % path)
341 return 0
342 if self.excludeDirs:
343 logger.debug("Path [%s] is excluded based on excludeDirs." % path)
344 return 0
345 if path in self.excludePaths:
346 logger.debug("Path [%s] is excluded based on excludePaths." % path)
347 return 0
348 for pattern in self.excludePatterns:
349 if re.compile(r"^%s$" % pattern).match(path):
350 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
351 return 0
352 for pattern in self.excludeBasenamePatterns:
353 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
354 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
355 return 0
356 self.append(path)
357 logger.debug("Added directory to list: [%s]" % path)
358 return 1
359
360 - def addDirContents(self, path, recursive=True, addSelf=True):
361 """
362 Adds the contents of a directory to the list.
363
364 The path must exist and must be a directory or a link to a directory.
365 The contents of the directory (as well as the directory path itself) will
366 be recursively added to the list, subject to any exclusions that are in
367 place. If you only want the directory and its immediate contents to be
368 added, then pass in C{recursive=False}.
369
370 @note: If a directory's absolute path matches an exclude pattern or path,
371 or if the directory contains the configured ignore file, then the
372 directory and all of its contents will be recursively excluded from the
373 list.
374
375 @note: If the passed-in directory happens to be a soft link, it will
376 still be recursed. However, any soft links I{within} the directory will
377 only be added by name, not recursively. Any invalid soft links (i.e.
378 soft links that point to non-existent items) will be silently ignored.
379
380 @note: The L{excludeDirs} flag only controls whether any given directory
381 path itself is added to the list once it has been discovered. It does
382 I{not} modify any behavior related to directory recursion.
383
384 @param path: Directory path whose contents should be added to the list
385 @type path: String representing a path on disk
386
387 @param recursive: Indicates whether directory contents should be added recursively.
388 @type recursive: Boolean value
389
390 @param addSelf: Indicates whether the directory itself should be added to the list.
391 @type addSelf: Boolean value
392
393 @return: Number of items recursively added to the list
394
395 @raise ValueError: If path is not a directory or does not exist.
396 @raise ValueError: If the path could not be encoded properly.
397 """
398 path = encodePath(path)
399 path = normalizeDir(path)
400 return self._addDirContentsInternal(path, recursive=recursive, includePath=addSelf)
401
402 - def _addDirContentsInternal(self, path, includePath=True, recursive=True):
403 """
404 Internal implementation of C{addDirContents}.
405
406 This internal implementation exists due to some refactoring. Basically,
407 some subclasses have a need to add the contents of a directory, but not
408 the directory itself. This is different than the standard C{FilesystemList}
409 behavior and actually ends up making a special case out of the first
410 call in the recursive chain. Since I don't want to expose the modified
411 interface, C{addDirContents} ends up being wholly implemented in terms
412 of this method.
413
414 @param path: Directory path whose contents should be added to the list.
415 @param includePath: Indicates whether to include the path as well as contents.
416 @param recursive: Indicates whether directory contents should be added recursively.
417
418 @return: Number of items recursively added to the list
419
420 @raise ValueError: If path is not a directory or does not exist.
421 """
422 added = 0
423 if not os.path.exists(path) or not os.path.isdir(path):
424 logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
425 raise ValueError("Path is not a directory or does not exist on disk.")
426 if path in self.excludePaths:
427 logger.debug("Path [%s] is excluded based on excludePaths." % path)
428 return added
429 for pattern in self.excludePatterns:
430 if re.compile(r"^%s$" % pattern).match(path):
431 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
432 return added
433 for pattern in self.excludeBasenamePatterns:
434 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
435 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
436 return added
437 if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
438 logger.debug("Path [%s] is excluded based on ignore file." % path)
439 return added
440 if includePath:
441 added += self.addDir(path)
442 for entry in os.listdir(path):
443 entrypath = os.path.join(path, entry)
444 if os.path.isfile(entrypath):
445 added += self.addFile(entrypath)
446 elif os.path.isdir(entrypath):
447 if os.path.islink(entrypath):
448 added += self.addDir(entrypath)
449 else:
450 if recursive:
451 added += self._addDirContentsInternal(entrypath)
452 else:
453 added += self.addDir(entrypath)
454 return added
455
456
457
458
459
460
462 """
463 Removes file entries from the list.
464
465 If C{pattern} is not passed in or is C{None}, then all file entries will
466 be removed from the list. Otherwise, only those file entries matching
467 the pattern will be removed. Any entry which does not exist on disk
468 will be ignored (use L{removeInvalid} to purge those entries).
469
470 This method might be fairly slow for large lists, since it must check the
471 type of each item in the list. If you know ahead of time that you want
472 to exclude all files, then you will be better off setting L{excludeFiles}
473 to C{True} before adding items to the list.
474
475 @param pattern: Regular expression pattern representing entries to remove
476
477 @return: Number of entries removed
478 @raise ValueError: If the passed-in pattern is not a valid regular expression.
479 """
480 removed = 0
481 if pattern is None:
482 for entry in self[:]:
483 if os.path.exists(entry) and os.path.isfile(entry):
484 self.remove(entry)
485 logger.debug("Removed path [%s] from list." % entry)
486 removed += 1
487 else:
488 try:
489 compiled = re.compile(pattern)
490 except re.error:
491 raise ValueError("Pattern is not a valid regular expression.")
492 for entry in self[:]:
493 if os.path.exists(entry) and os.path.isfile(entry):
494 if compiled.match(entry):
495 self.remove(entry)
496 logger.debug("Removed path [%s] from list." % entry)
497 removed += 1
498 logger.debug("Removed a total of %d entries." % removed);
499 return removed
500
502 """
503 Removes directory entries from the list.
504
505 If C{pattern} is not passed in or is C{None}, then all directory entries
506 will be removed from the list. Otherwise, only those directory entries
507 matching the pattern will be removed. Any entry which does not exist on
508 disk will be ignored (use L{removeInvalid} to purge those entries).
509
510 This method might be fairly slow for large lists, since it must check the
511 type of each item in the list. If you know ahead of time that you want
512 to exclude all directories, then you will be better off setting
513 L{excludeDirs} to C{True} before adding items to the list (note that this
514 will not prevent you from recursively adding the I{contents} of
515 directories).
516
517 @param pattern: Regular expression pattern representing entries to remove
518
519 @return: Number of entries removed
520 @raise ValueError: If the passed-in pattern is not a valid regular expression.
521 """
522 removed = 0
523 if pattern is None:
524 for entry in self[:]:
525 if os.path.exists(entry) and os.path.isdir(entry):
526 self.remove(entry)
527 logger.debug("Removed path [%s] from list." % entry)
528 removed += 1
529 else:
530 try:
531 compiled = re.compile(pattern)
532 except re.error:
533 raise ValueError("Pattern is not a valid regular expression.")
534 for entry in self[:]:
535 if os.path.exists(entry) and os.path.isdir(entry):
536 if compiled.match(entry):
537 self.remove(entry)
538 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
539 removed += 1
540 logger.debug("Removed a total of %d entries." % removed);
541 return removed
542
544 """
545 Removes soft link entries from the list.
546
547 If C{pattern} is not passed in or is C{None}, then all soft link entries
548 will be removed from the list. Otherwise, only those soft link entries
549 matching the pattern will be removed. Any entry which does not exist on
550 disk will be ignored (use L{removeInvalid} to purge those entries).
551
552 This method might be fairly slow for large lists, since it must check the
553 type of each item in the list. If you know ahead of time that you want
554 to exclude all soft links, then you will be better off setting
555 L{excludeLinks} to C{True} before adding items to the list.
556
557 @param pattern: Regular expression pattern representing entries to remove
558
559 @return: Number of entries removed
560 @raise ValueError: If the passed-in pattern is not a valid regular expression.
561 """
562 removed = 0
563 if pattern is None:
564 for entry in self[:]:
565 if os.path.exists(entry) and os.path.islink(entry):
566 self.remove(entry)
567 logger.debug("Removed path [%s] from list." % entry)
568 removed += 1
569 else:
570 try:
571 compiled = re.compile(pattern)
572 except re.error:
573 raise ValueError("Pattern is not a valid regular expression.")
574 for entry in self[:]:
575 if os.path.exists(entry) and os.path.islink(entry):
576 if compiled.match(entry):
577 self.remove(entry)
578 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
579 removed += 1
580 logger.debug("Removed a total of %d entries." % removed);
581 return removed
582
584 """
585 Removes from the list all entries matching a pattern.
586
587 This method removes from the list all entries which match the passed in
588 C{pattern}. Since there is no need to check the type of each entry, it
589 is faster to call this method than to call the L{removeFiles},
590 L{removeDirs} or L{removeLinks} methods individually. If you know which
591 patterns you will want to remove ahead of time, you may be better off
592 setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
593 items to the list.
594
595 @note: Unlike when using the exclude lists, the pattern here is I{not}
596 bounded at the front and the back of the string. You can use any pattern
597 you want.
598
599 @param pattern: Regular expression pattern representing entries to remove
600
601 @return: Number of entries removed.
602 @raise ValueError: If the passed-in pattern is not a valid regular expression.
603 """
604 try:
605 compiled = re.compile(pattern)
606 except re.error:
607 raise ValueError("Pattern is not a valid regular expression.")
608 removed = 0
609 for entry in self[:]:
610 if compiled.match(entry):
611 self.remove(entry)
612 logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
613 removed += 1
614 logger.debug("Removed a total of %d entries." % removed);
615 return removed
616
618 """
619 Removes from the list all entries that do not exist on disk.
620
621 This method removes from the list all entries which do not currently
622 exist on disk in some form. No attention is paid to whether the entries
623 are files or directories.
624
625 @return: Number of entries removed.
626 """
627 removed = 0
628 for entry in self[:]:
629 if not os.path.exists(entry):
630 self.remove(entry)
631 logger.debug("Removed path [%s] from list." % entry)
632 removed += 1
633 logger.debug("Removed a total of %d entries." % removed);
634 return removed
635
636
637
638
639
640
642 """Normalizes the list, ensuring that each entry is unique."""
643 orig = len(self)
644 self.sort()
645 dups = filter(lambda x, self=self: self[x] == self[x+1], range(0, len(self) - 1))
646 items = map(lambda x, self=self: self[x], dups)
647 map(self.remove, items)
648 new = len(self)
649 logger.debug("Completed normalizing list; removed %d items (%d originally, %d now)." % (new-orig, orig, new))
650
652 """
653 Verifies that all entries in the list exist on disk.
654 @return: C{True} if all entries exist, C{False} otherwise.
655 """
656 for entry in self:
657 if not os.path.exists(entry):
658 logger.debug("Path [%s] is invalid; list is not valid." % entry)
659 return False
660 logger.debug("All entries in list are valid.")
661 return True
662
663
664
665
666
667
669 """
670 Item returned by L{BackupFileList.generateSpan}.
671 """
672 - def __init__(self, fileList, size, capacity, utilization):
673 """
674 Create object.
675 @param fileList: List of files
676 @param size: Size (in bytes) of files
677 @param utilization: Utilization, as a percentage (0-100)
678 """
679 self.fileList = fileList
680 self.size = size
681 self.capacity = capacity
682 self.utilization = utilization
683
684
685
686
687
688
690
691
692
693
694
695 """
696 List of files to be backed up.
697
698 A BackupFileList is a L{FilesystemList} containing a list of files to be
699 backed up. It only contains files, not directories (soft links are treated
700 like files). On top of the generic functionality provided by
701 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
702 for each file in the list, and it also provides a method to calculate the
703 total size of the files in the list and a way to export the list into tar
704 form.
705
706 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
707 generateFitted, generateTarfile, removeUnchanged
708 """
709
710
711
712
713
717
718
719
720
721
722
724 """
725 Adds a directory to the list.
726
727 Note that this class does not allow directories to be added by themselves
728 (a backup list contains only files). However, since links to directories
729 are technically files, we allow them to be added.
730
731 This method is implemented in terms of the superclass method, with one
732 additional validation: the superclass method is only called if the
733 passed-in path is both a directory and a link. All of the superclass's
734 existing validations and restrictions apply.
735
736 @param path: Directory path to be added to the list
737 @type path: String representing a path on disk
738
739 @return: Number of items added to the list.
740
741 @raise ValueError: If path is not a directory or does not exist.
742 @raise ValueError: If the path could not be encoded properly.
743 """
744 path = encodePath(path)
745 path = normalizeDir(path)
746 if os.path.isdir(path) and not os.path.islink(path):
747 return 0
748 else:
749 return FilesystemList.addDir(self, path)
750
751
752
753
754
755
757 """
758 Returns the total size among all files in the list.
759 Only files are counted.
760 Soft links that point at files are ignored.
761 Entries which do not exist on disk are ignored.
762 @return: Total size, in bytes
763 """
764 total = 0.0
765 for entry in self:
766 if os.path.isfile(entry) and not os.path.islink(entry):
767 total += float(os.stat(entry).st_size)
768 return total
769
771 """
772 Generates a mapping from file to file size in bytes.
773 The mapping does include soft links, which are listed with size zero.
774 Entries which do not exist on disk are ignored.
775 @return: Dictionary mapping file to file size
776 """
777 table = { }
778 for entry in self:
779 if os.path.islink(entry):
780 table[entry] = 0.0
781 elif os.path.isfile(entry):
782 table[entry] = float(os.stat(entry).st_size)
783 return table
784
786 """
787 Generates a mapping from file to file digest.
788
789 Currently, the digest is an SHA hash, which should be pretty secure. In
790 the future, this might be a different kind of hash, but we guarantee that
791 the type of the hash will not change unless the library major version
792 number is bumped.
793
794 Entries which do not exist on disk are ignored.
795
796 Soft links are ignored. We would end up generating a digest for the file
797 that the soft link points at, which doesn't make any sense.
798
799 If C{stripPrefix} is passed in, then that prefix will be stripped from
800 each key when the map is generated. This can be useful in generating two
801 "relative" digest maps to be compared to one another.
802
803 @param stripPrefix: Common prefix to be stripped from paths
804 @type stripPrefix: String with any contents
805
806 @return: Dictionary mapping file to digest value
807 @see: L{removeUnchanged}
808 """
809 table = { }
810 if stripPrefix is not None:
811 for entry in self:
812 if os.path.isfile(entry) and not os.path.islink(entry):
813 table[entry.replace(stripPrefix, "", 1)] = BackupFileList._generateDigest(entry)
814 else:
815 for entry in self:
816 if os.path.isfile(entry) and not os.path.islink(entry):
817 table[entry] = BackupFileList._generateDigest(entry)
818 return table
819
821 """
822 Generates an SHA digest for a given file on disk.
823
824 The original code for this function used this simplistic implementation,
825 which requires reading the entire file into memory at once in order to
826 generate a digest value::
827
828 sha.new(open(path).read()).hexdigest()
829
830 Not surprisingly, this isn't an optimal solution. The U{Simple file
831 hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
832 Python Cookbook recipe describes how to incrementally generate a hash
833 value by reading in chunks of data rather than reading the file all at
834 once. The recipe relies on the the C{update()} method of the various
835 Python hashing algorithms.
836
837 In my tests using a 110 MB file on CD, the original implementation
838 requires 111 seconds. This implementation requires only 40-45 seconds,
839 which is a pretty substantial speed-up.
840
841 Practice shows that reading in around 4kB (4096 bytes) at a time yields
842 the best performance. Smaller reads are quite a bit slower, and larger
843 reads don't make much of a difference. The 4kB number makes me a little
844 suspicious, and I think it might be related to the size of a filesystem
845 read at the hardware level. However, I've decided to just hardcode 4096
846 until I have evidence that shows it's worthwhile making the read size
847 configurable.
848
849 @param path: Path to generate digest for.
850
851 @return: ASCII-safe SHA digest for the file.
852 @raise OSError: If the file cannot be opened.
853 """
854 s = sha.new()
855 f = open(path, mode="rb")
856 readBytes = 4096
857 while(readBytes > 0):
858 readString = f.read(readBytes)
859 s.update(readString)
860 readBytes = len(readString)
861 f.close()
862 digest = s.hexdigest()
863 logger.debug("Generated digest [%s] for file [%s]." % (digest, path))
864 return digest
865 _generateDigest = staticmethod(_generateDigest)
866
868 """
869 Generates a list of items that fit in the indicated capacity.
870
871 Sometimes, callers would like to include every item in a list, but are
872 unable to because not all of the items fit in the space available. This
873 method returns a copy of the list, containing only the items that fit in
874 a given capacity. A copy is returned so that we don't lose any
875 information if for some reason the fitted list is unsatisfactory.
876
877 The fitting is done using the functions in the knapsack module. By
878 default, the first fit algorithm is used, but you can also choose
879 from best fit, worst fit and alternate fit.
880
881 @param capacity: Maximum capacity among the files in the new list
882 @type capacity: Integer, in bytes
883
884 @param algorithm: Knapsack (fit) algorithm to use
885 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
886
887 @return: Copy of list with total size no larger than indicated capacity
888 @raise ValueError: If the algorithm is invalid.
889 """
890 table = self._getKnapsackTable()
891 function = BackupFileList._getKnapsackFunction(algorithm)
892 return function(table, capacity)[0]
893
895 """
896 Splits the list of items into sub-lists that fit in a given capacity.
897
898 Sometimes, callers need split to a backup file list into a set of smaller
899 lists. For instance, you could use this to "span" the files across a set
900 of discs.
901
902 The fitting is done using the functions in the knapsack module. By
903 default, the first fit algorithm is used, but you can also choose
904 from best fit, worst fit and alternate fit.
905
906 @note: If any of your items are larger than the capacity, then it won't
907 be possible to find a solution. In this case, a value error will be
908 raised.
909
910 @param capacity: Maximum capacity among the files in the new list
911 @type capacity: Integer, in bytes
912
913 @param algorithm: Knapsack (fit) algorithm to use
914 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
915
916 @return: List of L{SpanItem} objects.
917
918 @raise ValueError: If the algorithm is invalid.
919 @raise ValueError: If it's not possible to fit some items
920 """
921 spanItems = []
922 function = BackupFileList._getKnapsackFunction(algorithm)
923 table = self._getKnapsackTable(capacity)
924 iteration = 0
925 while len(table) > 0:
926 iteration += 1
927 fit = function(table, capacity)
928 if len(fit[0]) == 0:
929
930 raise ValueError("After iteration %d, unable to add any new items." % iteration)
931 removeKeys(table, fit[0])
932 utilization = (float(fit[1])/float(capacity))*100.0
933 item = SpanItem(fit[0], fit[1], capacity, utilization)
934 spanItems.append(item)
935 return spanItems
936
938 """
939 Converts the list into the form needed by the knapsack algorithms.
940 @return: Dictionary mapping file name to tuple of (file path, file size).
941 """
942 table = { }
943 for entry in self:
944 if os.path.islink(entry):
945 table[entry] = (entry, 0.0)
946 elif os.path.isfile(entry):
947 size = float(os.stat(entry).st_size)
948 if capacity is not None:
949 if size > capacity:
950 raise ValueError("File [%s] cannot fit in capacity %s." % (entry, displayBytes(capacity)))
951 table[entry] = (entry, size)
952 return table
953
955 """
956 Returns a reference to the function associated with an algorithm name.
957 Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
958 @param algorithm: Name of the algorithm
959 @return: Reference to knapsack function
960 @raise ValueError: If the algorithm name is unknown.
961 """
962 if algorithm == "first_fit":
963 return firstFit
964 elif algorithm == "best_fit":
965 return bestFit
966 elif algorithm == "worst_fit":
967 return worstFit
968 elif algorithm == "alternate_fit":
969 return alternateFit
970 else:
971 raise ValueError("Algorithm [%s] is invalid." % algorithm);
972 _getKnapsackFunction = staticmethod(_getKnapsackFunction)
973
975 """
976 Creates a tar file containing the files in the list.
977
978 By default, this method will create uncompressed tar files. If you pass
979 in mode C{'targz'}, then it will create gzipped tar files, and if you
980 pass in mode C{'tarbz2'}, then it will create bzipped tar files.
981
982 The tar file will be created as a GNU tar archive, which enables extended
983 file name lengths, etc. Since GNU tar is so prevalent, I've decided that
984 the extra functionality out-weighs the disadvantage of not being
985 "standard".
986
987 If you pass in C{flat=True}, then a "flat" archive will be created, and
988 all of the files will be added to the root of the archive. So, the file
989 C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.
990
991 By default, the whole method call fails if there are problems adding any
992 of the files to the archive, resulting in an exception. Under these
993 circumstances, callers are advised that they might want to call
994 L{removeInvalid()} and then attempt to extract the tar file a second
995 time, since the most common cause of failures is a missing file (a file
996 that existed when the list was built, but is gone again by the time the
997 tar file is built).
998
999 If you want to, you can pass in C{ignore=True}, and the method will
1000 ignore errors encountered when adding individual files to the archive
1001 (but not errors opening and closing the archive itself).
1002
1003 We'll always attempt to remove the tarfile from disk if an exception will
1004 be thrown.
1005
1006 @note: No validation is done as to whether the entries in the list are
1007 files, since only files or soft links should be in an object like this.
1008 However, to be safe, everything is explicitly added to the tar archive
1009 non-recursively so it's safe to include soft links to directories.
1010
1011 @note: The Python C{tarfile} module, which is used internally here, is
1012 supposed to deal properly with long filenames and links. In my testing,
1013 I have found that it appears to be able to add long really long filenames
1014 to archives, but doesn't do a good job reading them back out, even out of
1015 an archive it created. Fortunately, all Cedar Backup does is add files
1016 to archives.
1017
1018 @param path: Path of tar file to create on disk
1019 @type path: String representing a path on disk
1020
1021 @param mode: Tar creation mode
1022 @type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}
1023
1024 @param ignore: Indicates whether to ignore certain errors.
1025 @type ignore: Boolean
1026
1027 @param flat: Creates "flat" archive by putting all items in root
1028 @type flat: Boolean
1029
1030 @raise ValueError: If mode is not valid
1031 @raise ValueError: If list is empty
1032 @raise ValueError: If the path could not be encoded properly.
1033 @raise TarError: If there is a problem creating the tar file
1034 """
1035 path = encodePath(path)
1036 if len(self) == 0: raise ValueError("Empty list cannot be used to generate tarfile.")
1037 if(mode == 'tar'): tarmode = "w:"
1038 elif(mode == 'targz'): tarmode = "w:gz"
1039 elif(mode == 'tarbz2'): tarmode = "w:bz2"
1040 else: raise ValueError("Mode [%s] is not valid." % mode)
1041 try:
1042 tar = tarfile.open(path, tarmode)
1043 tar.posix = False
1044 for entry in self:
1045 try:
1046 if flat:
1047 tar.add(entry, arcname=os.path.basename(entry), recursive=False)
1048 else:
1049 tar.add(entry, recursive=False)
1050 except tarfile.TarError, e:
1051 if not ignore:
1052 raise e
1053 logger.info("Unable to add file [%s]; going on anyway." % entry)
1054 except OSError, e:
1055 if not ignore:
1056 raise tarfile.TarError(e)
1057 logger.info("Unable to add file [%s]; going on anyway." % entry)
1058 tar.close()
1059 except tarfile.ReadError, e:
1060 try: tar.close()
1061 except: pass
1062 if os.path.exists(path):
1063 try: os.remove(path)
1064 except: pass
1065 raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
1066 except tarfile.TarError, e:
1067 try: tar.close()
1068 except: pass
1069 if os.path.exists(path):
1070 try: os.remove(path)
1071 except: pass
1072 raise e
1073
1075 """
1076 Removes unchanged entries from the list.
1077
1078 This method relies on a digest map as returned from L{generateDigestMap}.
1079 For each entry in C{digestMap}, if the entry also exists in the current
1080 list I{and} the entry in the current list has the same digest value as in
1081 the map, the entry in the current list will be removed.
1082
1083 This method offers a convenient way for callers to filter unneeded
1084 entries from a list. The idea is that a caller will capture a digest map
1085 from C{generateDigestMap} at some point in time (perhaps the beginning of
1086 the week), and will save off that map using C{pickle} or some other
1087 method. Then, the caller could use this method sometime in the future to
1088 filter out any unchanged files based on the saved-off map.
1089
1090 If C{captureDigest} is passed-in as C{True}, then digest information will
1091 be captured for the entire list before the removal step occurs using the
1092 same rules as in L{generateDigestMap}. The check will involve a lookup
1093 into the complete digest map.
1094
1095 If C{captureDigest} is passed in as C{False}, we will only generate a
1096 digest value for files we actually need to check, and we'll ignore any
1097 entry in the list which isn't a file that currently exists on disk.
1098
1099 The return value varies depending on C{captureDigest}, as well. To
1100 preserve backwards compatibility, if C{captureDigest} is C{False}, then
1101 we'll just return a single value representing the number of entries
1102 removed. Otherwise, we'll return a tuple of C{(entries removed, digest
1103 map)}. The returned digest map will be in exactly the form returned by
1104 L{generateDigestMap}.
1105
1106 @note: For performance reasons, this method actually ends up rebuilding
1107 the list from scratch. First, we build a temporary dictionary containing
1108 all of the items from the original list. Then, we remove items as needed
1109 from the dictionary (which is faster than the equivalent operation on a
1110 list). Finally, we replace the contents of the current list based on the
1111 keys left in the dictionary. This should be transparent to the caller.
1112
1113 @param digestMap: Dictionary mapping file name to digest value.
1114 @type digestMap: Map as returned from L{generateDigestMap}.
1115
1116 @param captureDigest: Indicates that digest information should be captured.
1117 @type captureDigest: Boolean
1118
1119 @return: Number of entries removed
1120 """
1121 if captureDigest:
1122 removed = 0
1123 table = {}
1124 captured = {}
1125 for entry in self:
1126 if os.path.isfile(entry) and not os.path.islink(entry):
1127 table[entry] = BackupFileList._generateDigest(entry)
1128 captured[entry] = table[entry]
1129 else:
1130 table[entry] = None
1131 for entry in digestMap.keys():
1132 if table.has_key(entry):
1133 if table[entry] is not None:
1134 digest = table[entry]
1135 if digest == digestMap[entry]:
1136 removed += 1
1137 del table[entry]
1138 logger.debug("Discarded unchanged file [%s]." % entry)
1139 self[:] = table.keys()
1140 return (removed, captured)
1141 else:
1142 removed = 0
1143 table = {}
1144 for entry in self:
1145 table[entry] = None
1146 for entry in digestMap.keys():
1147 if table.has_key(entry):
1148 if os.path.isfile(entry) and not os.path.islink(entry):
1149 digest = BackupFileList._generateDigest(entry)
1150 if digest == digestMap[entry]:
1151 removed += 1
1152 del table[entry]
1153 logger.debug("Discarded unchanged file [%s]." % entry)
1154 self[:] = table.keys()
1155 return removed
1156
1157
1158
1159
1160
1161
1163
1164
1165
1166
1167
1168 """
1169 List of files and directories to be purged.
1170
1171 A PurgeItemList is a L{FilesystemList} containing a list of files and
1172 directories to be purged. On top of the generic functionality provided by
1173 L{FilesystemList}, this class adds functionality to remove items that are
1174 too young to be purged, and to actually remove each item in the list from
1175 the filesystem.
1176
1177 The other main difference is that when you add a directory's contents to a
1178 purge item list, the directory itself is not added to the list. This way,
1179 if someone asks to purge within in C{/opt/backup/collect}, that directory
1180 doesn't get removed once all of the files within it is gone.
1181 """
1182
1183
1184
1185
1186
1190
1191
1192
1193
1194
1195
1196 - def addDirContents(self, path, recursive=True, addSelf=False):
1197 """
1198 Adds the contents of a directory to the list.
1199
1200 The path must exist and must be a directory or a link to a directory.
1201 The contents of the directory (but I{not} the directory path itself) will
1202 be recursively added to the list, subject to any exclusions that are in
1203 place. If you only want the directory and its contents to be added, then
1204 pass in C{recursive=False}.
1205
1206 @note: If a directory's absolute path matches an exclude pattern or path,
1207 or if the directory contains the configured ignore file, then the
1208 directory and all of its contents will be recursively excluded from the
1209 list.
1210
1211 @note: If the passed-in directory happens to be a soft link, it will
1212 still be recursed. However, any soft links I{within} the directory will
1213 only be added by name, not recursively. Any invalid soft links (i.e.
1214 soft links that point to non-existent items) will be silently ignored.
1215
1216 @note: The L{excludeDirs} flag only controls whether any given soft link
1217 path itself is added to the list once it has been discovered. It does
1218 I{not} modify any behavior related to directory recursion.
1219
1220 @note: The L{excludeDirs} flag only controls whether any given directory
1221 path itself is added to the list once it has been discovered. It does
1222 I{not} modify any behavior related to directory recursion.
1223
1224 @param path: Directory path whose contents should be added to the list
1225 @type path: String representing a path on disk
1226
1227 @param recursive: Indicates whether directory contents should be added recursively.
1228 @type recursive: Boolean value
1229
1230 @param addSelf: Ignored in this subclass.
1231
1232 @return: Number of items recursively added to the list
1233
1234 @raise ValueError: If path is not a directory or does not exist.
1235 @raise ValueError: If the path could not be encoded properly.
1236 """
1237 path = encodePath(path)
1238 path = normalizeDir(path)
1239 return super(PurgeItemList, self)._addDirContentsInternal(path, includePath=False, recursive=recursive)
1240
1241
1242
1243
1244
1245
1247 """
1248 Removes from the list files younger than a certain age (in days).
1249
1250 Any file whose "age" in days is less than (C{<}) the value of the
1251 C{daysOld} parameter will be removed from the list so that it will not be
1252 purged later when L{purgeItems} is called. Directories and soft links
1253 will be ignored.
1254
1255 The "age" of a file is the amount of time since the file was last used,
1256 per the most recent of the file's C{st_atime} and C{st_mtime} values.
1257
1258 @note: Some people find the "sense" of this method confusing or
1259 "backwards". Keep in mind that this method is used to remove items
1260 I{from the list}, not from the filesystem! It removes from the list
1261 those items that you would I{not} want to purge because they are too
1262 young. As an example, passing in C{daysOld} of zero (0) would remove
1263 from the list no files, which would result in purging all of the files
1264 later. I would be happy to make a synonym of this method with an
1265 easier-to-understand "sense", if someone can suggest one.
1266
1267 @param daysOld: Minimum age of files that are to be kept in the list.
1268 @type daysOld: Integer value >= 0.
1269
1270 @return: Number of entries removed
1271 """
1272 removed = 0
1273 daysOld = int(daysOld)
1274 if daysOld < 0:
1275 raise ValueError("Days old value must be an integer >= 0.")
1276 for entry in self[:]:
1277 if os.path.isfile(entry) and not os.path.islink(entry):
1278 try:
1279 age = calculateFileAge(entry)
1280 if age < daysOld:
1281 removed += 1
1282 self.remove(entry)
1283 except OSError:
1284 pass
1285 return removed
1286
1288 """
1289 Purges all items in the list.
1290
1291 Every item in the list will be purged. Directories in the list will
1292 I{not} be purged recursively, and hence will only be removed if they are
1293 empty. Errors will be ignored.
1294
1295 To faciliate easy removal of directories that will end up being empty,
1296 the delete process happens in two passes: files first (including soft
1297 links), then directories.
1298
1299 @return: Tuple containing count of (files, dirs) removed
1300 """
1301 files = 0
1302 dirs = 0
1303 for entry in self:
1304 if os.path.exists(entry) and (os.path.isfile(entry) or os.path.islink(entry)):
1305 try:
1306 os.remove(entry)
1307 files += 1
1308 logger.debug("Purged file [%s]." % entry)
1309 except OSError:
1310 pass
1311 for entry in self:
1312 if os.path.exists(entry) and os.path.isdir(entry) and not os.path.islink(entry):
1313 try:
1314 os.rmdir(entry)
1315 dirs += 1
1316 logger.debug("Purged empty directory [%s]." % entry)
1317 except OSError:
1318 pass
1319 return (files, dirs)
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1331 """
1332 Normalizes a directory name.
1333
1334 For our purposes, a directory name is normalized by removing the trailing
1335 path separator, if any. This is important because we want directories to
1336 appear within lists in a consistent way, although from the user's
1337 perspective passing in C{/path/to/dir/} and C{/path/to/dir} are equivalent.
1338
1339 @param path: Path to be normalized.
1340 @type path: String representing a path on disk
1341
1342 @return: Normalized path, which should be equivalent to the original.
1343 """
1344 if path != os.sep and path[-1:] == os.sep:
1345 return path[:-1]
1346 return path
1347
1348
1349
1350
1351
1352
1353 -def compareContents(path1, path2, verbose=False):
1354 """
1355 Compares the contents of two directories to see if they are equivalent.
1356
1357 The two directories are recursively compared. First, we check whether they
1358 contain exactly the same set of files. Then, we check to see every given
1359 file has exactly the same contents in both directories.
1360
1361 This is all relatively simple to implement through the magic of
1362 L{BackupFileList.generateDigestMap}, which knows how to strip a path prefix
1363 off the front of each entry in the mapping it generates. This makes our
1364 comparison as simple as creating a list for each path, then generating a
1365 digest map for each path and comparing the two.
1366
1367 If no exception is thrown, the two directories are considered identical.
1368
1369 If the C{verbose} flag is C{True}, then an alternate (but slower) method is
1370 used so that any thrown exception can indicate exactly which file caused the
1371 comparison to fail. The thrown C{ValueError} exception distinguishes
1372 between the directories containing different files, and containing the same
1373 files with differing content.
1374
1375 @param path1: First path to compare.
1376 @type path1: String representing a path on disk
1377
1378 @param path2: First path to compare.
1379 @type path2: String representing a path on disk
1380
1381 @param verbose: Indicates whether a verbose response should be given.
1382 @type verbose: Boolean
1383
1384 @raise ValueError: If a directory doesn't exist or can't be read.
1385 @raise ValueError: If the two directories are not equivalent.
1386 @raise IOError: If there is an unusual problem reading the directories.
1387 """
1388 try:
1389 path1List = BackupFileList()
1390 path1List.addDirContents(path1)
1391 path1Digest = path1List.generateDigestMap(stripPrefix=normalizeDir(path1))
1392 path2List = BackupFileList()
1393 path2List.addDirContents(path2)
1394 path2Digest = path2List.generateDigestMap(stripPrefix=normalizeDir(path2))
1395 compareDigestMaps(path1Digest, path2Digest, verbose)
1396 except IOError, e:
1397 logger.error("I/O error encountered during consistency check.")
1398 raise e
1399
1401 """
1402 Compares two digest maps and throws an exception if they differ.
1403
1404 @param digest1: First digest to compare.
1405 @type digest1: Digest as returned from BackupFileList.generateDigestMap()
1406
1407 @param digest2: Second digest to compare.
1408 @type digest2: Digest as returned from BackupFileList.generateDigestMap()
1409
1410 @param verbose: Indicates whether a verbose response should be given.
1411 @type verbose: Boolean
1412
1413 @raise ValueError: If the two directories are not equivalent.
1414 """
1415 if not verbose:
1416 if digest1 != digest2:
1417 raise ValueError("Consistency check failed.")
1418 else:
1419 list1 = UnorderedList(digest1.keys())
1420 list2 = UnorderedList(digest2.keys())
1421 if list1 != list2:
1422 raise ValueError("Directories contain a different set of files.")
1423 for key in list1:
1424 if digest1[key] != digest2[key]:
1425 raise ValueError("File contents for [%s] vary between directories." % key)
1426