1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39 """
40 Provides filesystem-related objects.
41 @sort: FilesystemList, BackupFileList, PurgeItemList
42 @author: Kenneth J. Pronovici <pronovic@ieee.org>
43 """
44
45
46
47
48
49
50
51 import sys
52 import os
53 import re
54 import sha
55 import math
56 import logging
57 import tarfile
58
59
60 from CedarBackup2.knapsack import firstFit, bestFit, worstFit, alternateFit
61 from CedarBackup2.util import AbsolutePathList, ObjectTypeList, UnorderedList, RegexList
62 from CedarBackup2.util import removeKeys, displayBytes, calculateFileAge, encodePath, dereferenceLink
63
64
65
66
67
68
69 logger = logging.getLogger("CedarBackup2.log.filesystem")
70
71
72
73
74
75
77
78
79
80
81
82 """
83 Represents a list of filesystem items.
84
85 This is a generic class that represents a list of filesystem items. Callers
86 can add individual files or directories to the list, or can recursively add
87 the contents of a directory. The class also allows for up-front exclusions
88 in several forms (all files, all directories, all items matching a pattern,
89 all items whose basename matches a pattern, or all directories containing a
90 specific "ignore file"). Symbolic links are typically backed up
91 non-recursively, i.e. the link to a directory is backed up, but not the
92 contents of that link (we don't want to deal with recursive loops, etc.).
93
94 The custom methods such as L{addFile} will only add items if they exist on
95 the filesystem and do not match any exclusions that are already in place.
96 However, since a FilesystemList is a subclass of Python's standard list
97 class, callers can also add items to the list in the usual way, using
98 methods like C{append()} or C{insert()}. No validations apply to items
99 added to the list in this way; however, many list-manipulation methods deal
100 "gracefully" with items that don't exist in the filesystem, often by
101 ignoring them.
102
103 Once a list has been created, callers can remove individual items from the
104 list using standard methods like C{pop()} or C{remove()} or they can use
105 custom methods to remove specific types of entries or entries which match a
106 particular pattern.
107
108 @note: Regular expression patterns that apply to paths are assumed to be
109 bounded at front and back by the beginning and end of the string, i.e. they
110 are treated as if they begin with C{^} and end with C{$}. This is true
111 whether we are matching a complete path or a basename.
112
113 @note: Some platforms, like Windows, do not support soft links. On those
114 platforms, the ignore-soft-links flag can be set, but it won't do any good
115 because the operating system never reports a file as a soft link.
116
117 @sort: __init__, addFile, addDir, addDirContents, removeFiles, removeDirs,
118 removeLinks, removeMatch, removeInvalid, normalize, validate,
119 excludeFiles, excludeDirs, excludeLinks, excludePaths,
120 excludePatterns, excludeBasenamePatterns, ignoreFile
121 """
122
123
124
125
126
127
145
146
147
148
149
150
def _setExcludeFiles(self, value):
    """
    Property target used to set the exclude files flag.
    No validations, but we normalize the value to C{True} or C{False}.
    """
    self._excludeFiles = bool(value)

def _getExcludeFiles(self):
    """
    Property target used to get the exclude files flag.
    """
    return self._excludeFiles

def _setExcludeDirs(self, value):
    """
    Property target used to set the exclude directories flag.
    No validations, but we normalize the value to C{True} or C{False}.
    """
    self._excludeDirs = bool(value)

def _getExcludeDirs(self):
    """
    Property target used to get the exclude directories flag.
    """
    return self._excludeDirs

def _setExcludeLinks(self, value):
    """
    Property target used to set the exclude soft links flag.
    No validations, but we normalize the value to C{True} or C{False}.
    """
    self._excludeLinks = bool(value)

def _getExcludeLinks(self):
    """
    Property target used to get the exclude soft links flag.
    """
    return self._excludeLinks

def _setExcludePaths(self, value):
    """
    Property target used to set the exclude paths list.
    A C{None} value is converted to an empty list.
    Elements do not have to exist on disk at the time of assignment.
    @raise ValueError: If any list element is not an absolute path.
    """
    # Always start from a fresh list so a previous assignment never leaks through.
    self._absoluteExcludePaths = AbsolutePathList()
    if value is not None:
        self._absoluteExcludePaths.extend(value)

def _getExcludePaths(self):
    """
    Property target used to get the absolute exclude paths list.
    """
    return self._absoluteExcludePaths

def _setExcludePatterns(self, value):
    """
    Property target used to set the exclude patterns list.
    A C{None} value is converted to an empty list.
    """
    self._excludePatterns = RegexList()
    if value is not None:
        self._excludePatterns.extend(value)

def _getExcludePatterns(self):
    """
    Property target used to get the exclude patterns list.
    """
    return self._excludePatterns

def _setExcludeBasenamePatterns(self, value):
    """
    Property target used to set the exclude basename patterns list.
    A C{None} value is converted to an empty list.
    """
    self._excludeBasenamePatterns = RegexList()
    if value is not None:
        self._excludeBasenamePatterns.extend(value)

def _getExcludeBasenamePatterns(self):
    """
    Property target used to get the exclude basename patterns list.
    """
    return self._excludeBasenamePatterns

def _setIgnoreFile(self, value):
    """
    Property target used to set the ignore file.
    The value must be a non-empty string if it is not C{None}.
    @raise ValueError: If the value is an empty string.
    """
    if value is not None:
        if len(value) < 1:
            raise ValueError("The ignore file must be a non-empty string.")
    # C{None} is stored as-is; it means "no ignore file configured".
    self._ignoreFile = value

def _getIgnoreFile(self):
    """
    Property target used to get the ignore file.
    """
    return self._ignoreFile

excludeFiles = property(_getExcludeFiles, _setExcludeFiles, None, "Boolean indicating whether files should be excluded.")
excludeDirs = property(_getExcludeDirs, _setExcludeDirs, None, "Boolean indicating whether directories should be excluded.")
excludeLinks = property(_getExcludeLinks, _setExcludeLinks, None, "Boolean indicating whether soft links should be excluded.")
excludePaths = property(_getExcludePaths, _setExcludePaths, None, "List of absolute paths to be excluded.")
excludePatterns = property(_getExcludePatterns, _setExcludePatterns, None,
                           "List of regular expression patterns (matching complete path) to be excluded.")
excludeBasenamePatterns = property(_getExcludeBasenamePatterns, _setExcludeBasenamePatterns,
                                   None, "List of regular expression patterns (matching basename) to be excluded.")
ignoreFile = property(_getIgnoreFile, _setIgnoreFile, None, "Name of file which will cause directory contents to be ignored.")
272
273
274
275
276
277
def addFile(self, path):
    """
    Adds a file to the list.

    The path must exist and must be a file or a link to an existing file.  It
    will be added to the list subject to any exclusions that are in place.

    @param path: File path to be added to the list
    @type path: String representing a path on disk

    @return: Number of items added to the list.

    @raise ValueError: If path is not a file or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    path = encodePath(path)
    if not os.path.exists(path) or not os.path.isfile(path):
        logger.debug("Path [%s] is not a file or does not exist on disk." % path)
        raise ValueError("Path is not a file or does not exist on disk.")
    if self.excludeLinks and os.path.islink(path):
        logger.debug("Path [%s] is excluded based on excludeLinks." % path)
        return 0
    if self.excludeFiles:
        logger.debug("Path [%s] is excluded based on excludeFiles." % path)
        return 0
    if path in self.excludePaths:
        logger.debug("Path [%s] is excluded based on excludePaths." % path)
        return 0
    # Exclusion patterns are implicitly anchored at both ends of the string.
    for pattern in self.excludePatterns:
        pattern = encodePath(pattern)
        if re.compile(r"^%s$" % pattern).match(path):
            logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
            return 0
    basename = os.path.basename(path)  # invariant across the basename patterns
    for pattern in self.excludeBasenamePatterns:
        pattern = encodePath(pattern)
        if re.compile(r"^%s$" % pattern).match(basename):
            logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
            return 0
    self.append(path)
    logger.debug("Added file to list: [%s]" % path)
    return 1
319
def addDir(self, path):
    """
    Adds a directory to the list.

    The path must exist and must be a directory or a link to an existing
    directory.  It will be added to the list subject to any exclusions that
    are in place.  The L{ignoreFile} does not apply to this method, only to
    L{addDirContents}.

    @param path: Directory path to be added to the list
    @type path: String representing a path on disk

    @return: Number of items added to the list.

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    path = encodePath(path)
    path = normalizeDir(path)
    if not os.path.exists(path) or not os.path.isdir(path):
        logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
        raise ValueError("Path is not a directory or does not exist on disk.")
    if self.excludeLinks and os.path.islink(path):
        logger.debug("Path [%s] is excluded based on excludeLinks." % path)
        return 0
    if self.excludeDirs:
        logger.debug("Path [%s] is excluded based on excludeDirs." % path)
        return 0
    if path in self.excludePaths:
        logger.debug("Path [%s] is excluded based on excludePaths." % path)
        return 0
    # Exclusion patterns are implicitly anchored at both ends of the string.
    for pattern in self.excludePatterns:
        pattern = encodePath(pattern)
        if re.compile(r"^%s$" % pattern).match(path):
            logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
            return 0
    basename = os.path.basename(path)  # invariant across the basename patterns
    for pattern in self.excludeBasenamePatterns:
        pattern = encodePath(pattern)
        if re.compile(r"^%s$" % pattern).match(basename):
            logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
            return 0
    self.append(path)
    logger.debug("Added directory to list: [%s]" % path)
    return 1
364
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
    """
    Adds the contents of a directory to the list.

    The path must exist and must be a directory or a link to a directory.
    The contents of the directory (as well as the directory path itself) will
    be recursively added to the list, subject to any exclusions that are in
    place.  If you only want the directory and its immediate contents to be
    added, then pass in C{recursive=False}.

    @note: If a directory's absolute path matches an exclude pattern or path,
    or if the directory contains the configured ignore file, then the
    directory and all of its contents will be recursively excluded from the
    list.

    @note: If the passed-in directory happens to be a soft link, it will be
    recursed.  However, the linkDepth parameter controls whether any soft
    links I{within} the directory will be recursed.  The link depth is the
    maximum depth of the tree at which soft links should be followed.  So, a
    depth of 0 does not follow any soft links, a depth of 1 follows only
    links within the passed-in directory, a depth of 2 follows the links at
    the next level down, etc.

    @note: Any invalid soft links (i.e. soft links that point to
    non-existent items) will be silently ignored.

    @note: The L{excludeDirs} flag only controls whether any given directory
    path itself is added to the list once it has been discovered.  It does
    I{not} modify any behavior related to directory recursion.

    @note: If you call this method I{on a link to a directory} that link will
    never be dereferenced (it may, however, be followed).

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Indicates whether directory contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Indicates whether the directory itself should be added to the list.
    @type addSelf: Boolean value

    @param linkDepth: Maximum depth of the tree at which soft links should be followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @param dereference: Indicates whether soft links, if followed, should be dereferenced
    @type dereference: Boolean value

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    path = encodePath(path)
    path = normalizeDir(path)
    # Note the argument order: the internal method takes includePath (our addSelf) before recursive.
    return self._addDirContentsInternal(path, addSelf, recursive, linkDepth, dereference)
421
422 - def _addDirContentsInternal(self, path, includePath=True, recursive=True, linkDepth=0, dereference=False):
423 """
424 Internal implementation of C{addDirContents}.
425
426 This internal implementation exists due to some refactoring. Basically,
427 some subclasses have a need to add the contents of a directory, but not
428 the directory itself. This is different than the standard C{FilesystemList}
429 behavior and actually ends up making a special case out of the first
430 call in the recursive chain. Since I don't want to expose the modified
431 interface, C{addDirContents} ends up being wholly implemented in terms
432 of this method.
433
434 The linkDepth parameter controls whether soft links are followed when we
435 are adding the contents recursively. Any recursive calls reduce the
436 value by one. If the value is zero or less, then soft links will just be
437 added as directories, but will not be followed. This means that links
438 are followed to a I{constant depth} starting from the top-most directory.
439
440 There is one difference between soft links and directories: soft links
441 that are added recursively are not placed into the list explicitly. This
442 is because if we do add the links recursively, the resulting tar file
443 gets a little confused (it has a link and a directory with the same
444 name).
445
446 @note: If you call this method I{on a link to a directory} that link will
447 never be dereferenced (it may, however, be followed).
448
449 @param path: Directory path whose contents should be added to the list.
450 @param includePath: Indicates whether to include the path as well as contents.
451 @param recursive: Indicates whether directory contents should be added recursively.
452 @param linkDepth: Depth of soft links that should be followed
453 @param dereference: Indicates whether soft links, if followed, should be dereferenced
454
455 @return: Number of items recursively added to the list
456
457 @raise ValueError: If path is not a directory or does not exist.
458 """
459 added = 0
460 if not os.path.exists(path) or not os.path.isdir(path):
461 logger.debug("Path [%s] is not a directory or does not exist on disk." % path)
462 raise ValueError("Path is not a directory or does not exist on disk.")
463 if path in self.excludePaths:
464 logger.debug("Path [%s] is excluded based on excludePaths." % path)
465 return added
466 for pattern in self.excludePatterns:
467 pattern = encodePath(pattern)
468 if re.compile(r"^%s$" % pattern).match(path):
469 logger.debug("Path [%s] is excluded based on pattern [%s]." % (path, pattern))
470 return added
471 for pattern in self.excludeBasenamePatterns:
472 pattern = encodePath(pattern)
473 if re.compile(r"^%s$" % pattern).match(os.path.basename(path)):
474 logger.debug("Path [%s] is excluded based on basename pattern [%s]." % (path, pattern))
475 return added
476 if self.ignoreFile is not None and os.path.exists(os.path.join(path, self.ignoreFile)):
477 logger.debug("Path [%s] is excluded based on ignore file." % path)
478 return added
479 if includePath:
480 added += self.addDir(path)
481 for entry in os.listdir(path):
482 entrypath = os.path.join(path, entry)
483 if os.path.isfile(entrypath):
484 if linkDepth > 0 and dereference:
485 derefpath = dereferenceLink(entrypath)
486 if derefpath != entrypath:
487 added += self.addFile(derefpath)
488 added += self.addFile(entrypath)
489 elif os.path.isdir(entrypath):
490 if os.path.islink(entrypath):
491 if recursive:
492 if linkDepth > 0:
493 newDepth = linkDepth - 1;
494 if dereference:
495 derefpath = dereferenceLink(entrypath)
496 if derefpath != entrypath:
497 added += self._addDirContentsInternal(derefpath, True, recursive, newDepth, dereference)
498 added += self.addDir(entrypath)
499 else:
500 added += self._addDirContentsInternal(entrypath, False, recursive, newDepth, dereference)
501 else:
502 added += self.addDir(entrypath)
503 else:
504 added += self.addDir(entrypath)
505 else:
506 if recursive:
507 newDepth = linkDepth - 1;
508 added += self._addDirContentsInternal(entrypath, True, recursive, newDepth, dereference)
509 else:
510 added += self.addDir(entrypath)
511 return added
512
513
514
515
516
517
def removeFiles(self, pattern=None):
    """
    Removes file entries from the list.

    If C{pattern} is not passed in or is C{None}, then all file entries will
    be removed from the list.  Otherwise, only those file entries matching
    the pattern will be removed.  Any entry which does not exist on disk
    will be ignored (use L{removeInvalid} to purge those entries).

    This method might be fairly slow for large lists, since it must check the
    type of each item in the list.  If you know ahead of time that you want
    to exclude all files, then you will be better off setting L{excludeFiles}
    to C{True} before adding items to the list.

    @param pattern: Regular expression pattern representing entries to remove

    @return: Number of entries removed
    @raise ValueError: If the passed-in pattern is not a valid regular expression.
    """
    compiled = None
    if pattern is not None:
        try:
            pattern = encodePath(pattern)
            compiled = re.compile(pattern)
        except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
    removed = 0
    # Iterate over a copy, since we mutate the list as we go.
    for entry in self[:]:
        if os.path.exists(entry) and os.path.isfile(entry):
            if compiled is None:
                self.remove(entry)
                logger.debug("Removed path [%s] from list." % entry)
                removed += 1
            elif compiled.match(entry):
                self.remove(entry)
                logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
                removed += 1
    logger.debug("Removed a total of %d entries." % removed)
    return removed
558
def removeDirs(self, pattern=None):
    """
    Removes directory entries from the list.

    If C{pattern} is not passed in or is C{None}, then all directory entries
    will be removed from the list.  Otherwise, only those directory entries
    matching the pattern will be removed.  Any entry which does not exist on
    disk will be ignored (use L{removeInvalid} to purge those entries).

    This method might be fairly slow for large lists, since it must check the
    type of each item in the list.  If you know ahead of time that you want
    to exclude all directories, then you will be better off setting
    L{excludeDirs} to C{True} before adding items to the list (note that this
    will not prevent you from recursively adding the I{contents} of
    directories).

    @param pattern: Regular expression pattern representing entries to remove

    @return: Number of entries removed
    @raise ValueError: If the passed-in pattern is not a valid regular expression.
    """
    compiled = None
    if pattern is not None:
        try:
            pattern = encodePath(pattern)
            compiled = re.compile(pattern)
        except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
    removed = 0
    # Iterate over a copy, since we mutate the list as we go.
    for entry in self[:]:
        if os.path.exists(entry) and os.path.isdir(entry):
            if compiled is None:
                self.remove(entry)
                logger.debug("Removed path [%s] from list." % entry)
                removed += 1
            elif compiled.match(entry):
                self.remove(entry)
                logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
                removed += 1
    logger.debug("Removed a total of %d entries." % removed)
    return removed
601
def removeLinks(self, pattern=None):
    """
    Removes soft link entries from the list.

    If C{pattern} is not passed in or is C{None}, then all soft link entries
    will be removed from the list.  Otherwise, only those soft link entries
    matching the pattern will be removed.  Any entry which does not exist on
    disk will be ignored (use L{removeInvalid} to purge those entries).

    This method might be fairly slow for large lists, since it must check the
    type of each item in the list.  If you know ahead of time that you want
    to exclude all soft links, then you will be better off setting
    L{excludeLinks} to C{True} before adding items to the list.

    @param pattern: Regular expression pattern representing entries to remove

    @return: Number of entries removed
    @raise ValueError: If the passed-in pattern is not a valid regular expression.
    """
    compiled = None
    if pattern is not None:
        try:
            pattern = encodePath(pattern)
            compiled = re.compile(pattern)
        except re.error:
            raise ValueError("Pattern is not a valid regular expression.")
    removed = 0
    # Iterate over a copy, since we mutate the list as we go.
    for entry in self[:]:
        # os.path.exists() follows links, so links whose target is missing are left alone.
        if os.path.exists(entry) and os.path.islink(entry):
            if compiled is None:
                self.remove(entry)
                logger.debug("Removed path [%s] from list." % entry)
                removed += 1
            elif compiled.match(entry):
                self.remove(entry)
                logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
                removed += 1
    logger.debug("Removed a total of %d entries." % removed)
    return removed
642
def removeMatch(self, pattern):
    """
    Removes from the list all entries matching a pattern.

    This method removes from the list all entries which match the passed in
    C{pattern}.  Since there is no need to check the type of each entry, it
    is faster to call this method than to call the L{removeFiles},
    L{removeDirs} or L{removeLinks} methods individually.  If you know which
    patterns you will want to remove ahead of time, you may be better off
    setting L{excludePatterns} or L{excludeBasenamePatterns} before adding
    items to the list.

    @note: Unlike when using the exclude lists, the pattern here is I{not}
    bounded at the front and the back of the string.  You can use any pattern
    you want.

    @param pattern: Regular expression pattern representing entries to remove

    @return: Number of entries removed.
    @raise ValueError: If the passed-in pattern is not a valid regular expression.
    """
    try:
        pattern = encodePath(pattern)
        compiled = re.compile(pattern)
    except re.error:
        raise ValueError("Pattern is not a valid regular expression.")
    removed = 0
    # Iterate over a copy, since we mutate the list as we go.
    for entry in self[:]:
        if compiled.match(entry):
            self.remove(entry)
            logger.debug("Removed path [%s] from list based on pattern [%s]." % (entry, pattern))
            removed += 1
    logger.debug("Removed a total of %d entries." % removed)
    return removed
677
def removeInvalid(self):
    """
    Removes from the list all entries that do not exist on disk.

    This method removes from the list all entries which do not currently
    exist on disk in some form.  No attention is paid to whether the entries
    are files or directories.

    @return: Number of entries removed.
    """
    removed = 0
    kept = []
    for entry in self:
        if os.path.exists(entry):
            kept.append(entry)
        else:
            logger.debug("Removed path [%s] from list." % entry)
            removed += 1
    # Rebuild in place in one pass, instead of a linear list.remove() per invalid entry.
    self[:] = kept
    logger.debug("Removed a total of %d entries." % removed)
    return removed
696
697
698
699
700
701
def normalize(self):
    """
    Normalizes the list, ensuring that each entry is unique.

    The list is sorted in place and then adjacent duplicates are dropped, so
    the result is a sorted list of unique entries.
    """
    orig = len(self)
    self.sort()
    unique = []
    for entry in self:
        # After sorting, equal entries are adjacent, so comparing to the last kept one suffices.
        if not unique or unique[-1] != entry:
            unique.append(entry)
    self[:] = unique
    new = len(self)
    logger.debug("Completed normalizing list; removed %d items (%d originally, %d now)." % (orig - new, orig, new))
711
def validate(self):
    """
    Verifies that all entries in the list exist on disk.
    @return: C{True} if all entries exist, C{False} otherwise.
    """
    for entry in self:
        if not os.path.exists(entry):
            # Stop at the first missing entry; one is enough to invalidate the list.
            logger.debug("Path [%s] is invalid; list is not valid." % entry)
            return False
    logger.debug("All entries in list are valid.")
    return True
723
724
725
726
727
728
730 """
731 Item returned by L{BackupFileList.generateSpan}.
732 """
def __init__(self, fileList, size, capacity, utilization):
    """
    Create object.
    @param fileList: List of files
    @param size: Size (in bytes) of files
    @param capacity: Capacity that the files were fitted into
    @param utilization: Utilization, as a percentage (0-100)
    """
    self.fileList = fileList
    self.size = size
    self.capacity = capacity
    self.utilization = utilization
744
745
746
747
748
749
751
752
753
754
755
756 """
757 List of files to be backed up.
758
759 A BackupFileList is a L{FilesystemList} containing a list of files to be
760 backed up. It only contains files, not directories (soft links are treated
761 like files). On top of the generic functionality provided by
762 L{FilesystemList}, this class adds functionality to keep a hash (checksum)
763 for each file in the list, and it also provides a method to calculate the
764 total size of the files in the list and a way to export the list into tar
765 form.
766
767 @sort: __init__, addDir, totalSize, generateSizeMap, generateDigestMap,
768 generateFitted, generateTarfile, removeUnchanged
769 """
770
771
772
773
774
778
779
780
781
782
783
def addDir(self, path):
    """
    Adds a directory to the list.

    Note that this class does not allow directories to be added by themselves
    (a backup list contains only files).  However, since links to directories
    are technically files, we allow them to be added.

    This method is implemented in terms of the superclass method, with one
    additional validation: a real (non-link) directory is silently skipped
    rather than added.  Every other path is handed to the superclass method,
    so all of the superclass's existing validations and restrictions apply.

    @param path: Directory path to be added to the list
    @type path: String representing a path on disk

    @return: Number of items added to the list.

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    path = encodePath(path)
    path = normalizeDir(path)
    if os.path.isdir(path) and not os.path.islink(path):
        return 0
    return FilesystemList.addDir(self, path)
811
812
813
814
815
816
def totalSize(self):
    """
    Returns the total size among all files in the list.
    Only files are counted.
    Soft links that point at files are ignored.
    Entries which do not exist on disk are ignored.
    @return: Total size, in bytes
    """
    total = 0.0
    for entry in self:
        # isfile() follows links, so links must be filtered out explicitly.
        if os.path.isfile(entry) and not os.path.islink(entry):
            total += float(os.stat(entry).st_size)
    return total
830
def generateSizeMap(self):
    """
    Generates a mapping from file to file size in bytes.
    The mapping does include soft links, which are listed with size zero.
    Entries which do not exist on disk are ignored.
    @return: Dictionary mapping file to file size
    """
    table = {}
    for entry in self:
        # Check for links first: isfile() would otherwise follow them to the target.
        if os.path.islink(entry):
            table[entry] = 0.0
        elif os.path.isfile(entry):
            table[entry] = float(os.stat(entry).st_size)
    return table
845
def generateDigestMap(self, stripPrefix=None):
    """
    Generates a mapping from file to file digest.

    Currently, the digest is an SHA hash, which should be pretty secure.  In
    the future, this might be a different kind of hash, but we guarantee that
    the type of the hash will not change unless the library major version
    number is bumped.

    Entries which do not exist on disk are ignored.

    Soft links are ignored.  We would end up generating a digest for the file
    that the soft link points at, which doesn't make any sense.

    If C{stripPrefix} is passed in, then that prefix will be stripped from
    each key when the map is generated.  This can be useful in generating two
    "relative" digest maps to be compared to one another.  Entries that do
    not start with the prefix keep their full path as the key.

    @param stripPrefix: Common prefix to be stripped from paths
    @type stripPrefix: String with any contents

    @return: Dictionary mapping file to digest value
    @see: L{removeUnchanged}
    """
    table = {}
    for entry in self:
        if os.path.isfile(entry) and not os.path.islink(entry):
            key = entry
            # Strip only a genuine leading prefix; str.replace() would also
            # cut an occurrence from the middle of the path.
            if stripPrefix is not None and entry.startswith(stripPrefix):
                key = entry[len(stripPrefix):]
            table[key] = BackupFileList._generateDigest(entry)
    return table
880
def _generateDigest(path):
    """
    Generates an SHA digest for a given file on disk.

    The original code for this function used this simplistic implementation,
    which requires reading the entire file into memory at once in order to
    generate a digest value::

       sha.new(open(path).read()).hexdigest()

    Not surprisingly, this isn't an optimal solution.  The U{Simple file
    hashing <http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/259109>}
    Python Cookbook recipe describes how to incrementally generate a hash
    value by reading in chunks of data rather than reading the file all at
    once.  The recipe relies on the C{update()} method of the various
    Python hashing algorithms.

    In my tests using a 110 MB file on CD, the original implementation
    requires 111 seconds.  This implementation requires only 40-45 seconds,
    which is a pretty substantial speed-up.

    Practice shows that reading in around 4kB (4096 bytes) at a time yields
    the best performance.  Smaller reads are quite a bit slower, and larger
    reads don't make much of a difference.  The 4kB number makes me a little
    suspicious, and I think it might be related to the size of a filesystem
    read at the hardware level.  However, I've decided to just hardcode 4096
    until I have evidence that shows it's worthwhile making the read size
    configurable.

    @param path: Path to generate digest for.

    @return: ASCII-safe SHA digest for the file.
    @raise OSError: If the file cannot be opened.
    """
    # hashlib.sha1 produces the same digest as the legacy sha module, which
    # is deprecated and no longer exists on newer interpreters.
    try:
        import hashlib
        digester = hashlib.sha1()
    except ImportError:
        import sha
        digester = sha.new()
    f = open(path, mode="rb")
    try:
        while True:
            chunk = f.read(4096)
            if not chunk:
                break
            digester.update(chunk)
    finally:
        # Close the handle even if a read fails part-way through.
        f.close()
    digest = digester.hexdigest()
    logger.debug("Generated digest [%s] for file [%s]." % (digest, path))
    return digest
_generateDigest = staticmethod(_generateDigest)
927
929 """
930 Generates a list of items that fit in the indicated capacity.
931
932 Sometimes, callers would like to include every item in a list, but are
933 unable to because not all of the items fit in the space available. This
934 method returns a copy of the list, containing only the items that fit in
935 a given capacity. A copy is returned so that we don't lose any
936 information if for some reason the fitted list is unsatisfactory.
937
938 The fitting is done using the functions in the knapsack module. By
939 default, the first fit algorithm is used, but you can also choose
940 from best fit, worst fit and alternate fit.
941
942 @param capacity: Maximum total size of the files in the new list
943 @type capacity: Integer, in bytes
944
945 @param algorithm: Knapsack (fit) algorithm to use
946 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
947
948 @return: Copy of list with total size no larger than indicated capacity
949 @raise ValueError: If the algorithm is invalid.
950 """
951 table = self._getKnapsackTable()
952 function = BackupFileList._getKnapsackFunction(algorithm)
953 return function(table, capacity)[0]
954
956 """
957 Splits the list of items into sub-lists that fit in a given capacity.
958
959 Sometimes, callers need split to a backup file list into a set of smaller
960 lists. For instance, you could use this to "span" the files across a set
961 of discs.
962
963 The fitting is done using the functions in the knapsack module. By
964 default, the first fit algorithm is used, but you can also choose
965 from best fit, worst fit and alternate fit.
966
967 @note: If any of your items are larger than the capacity, then it won't
968 be possible to find a solution. In this case, a value error will be
969 raised.
970
971 @param capacity: Maximum capacity among the files in the new list
972 @type capacity: Integer, in bytes
973
974 @param algorithm: Knapsack (fit) algorithm to use
975 @type algorithm: One of "first_fit", "best_fit", "worst_fit", "alternate_fit"
976
977 @return: List of L{SpanItem} objects.
978
979 @raise ValueError: If the algorithm is invalid.
980 @raise ValueError: If it's not possible to fit some items
981 """
982 spanItems = []
983 function = BackupFileList._getKnapsackFunction(algorithm)
984 table = self._getKnapsackTable(capacity)
985 iteration = 0
986 while len(table) > 0:
987 iteration += 1
988 fit = function(table, capacity)
989 if len(fit[0]) == 0:
990
991 raise ValueError("After iteration %d, unable to add any new items." % iteration)
992 removeKeys(table, fit[0])
993 utilization = (float(fit[1])/float(capacity))*100.0
994 item = SpanItem(fit[0], fit[1], capacity, utilization)
995 spanItems.append(item)
996 return spanItems
997
def _getKnapsackTable(self, capacity=None):
    """
    Converts the list into the form needed by the knapsack algorithms.

    Soft links are entered with a size of C{0.0}.  Regular files are entered
    with their C{st_size} as a float.  Entries that are neither a link nor
    a regular file are left out of the table.

    @param capacity: Optional size limit; when given, every regular file
    must be no larger than this value.
    @type capacity: Number of bytes, or C{None} for no limit

    @return: Dictionary mapping file name to tuple of (file path, file size).
    @raise ValueError: If C{capacity} is given and a file is larger than it.
    """
    table = {}
    for entry in self:
        if os.path.islink(entry):
            # Links are checked first, so a link to a file counts as a link
            table[entry] = (entry, 0.0)
            continue
        if not os.path.isfile(entry):
            continue
        size = float(os.stat(entry).st_size)
        if capacity is not None and size > capacity:
            raise ValueError("File [%s] cannot fit in capacity %s." % (entry, displayBytes(capacity)))
        table[entry] = (entry, size)
    return table
1014
def _getKnapsackFunction(algorithm):
    """
    Returns a reference to the function associated with an algorithm name.
    Algorithm name must be one of "first_fit", "best_fit", "worst_fit", "alternate_fit"
    @param algorithm: Name of the algorithm
    @return: Reference to knapsack function
    @raise ValueError: If the algorithm name is unknown.
    """
    if algorithm == "first_fit":
        return firstFit
    if algorithm == "best_fit":
        return bestFit
    if algorithm == "worst_fit":
        return worstFit
    if algorithm == "alternate_fit":
        return alternateFit
    raise ValueError("Algorithm [%s] is invalid." % algorithm)
_getKnapsackFunction = staticmethod(_getKnapsackFunction)
1034
"""
Creates a tar file containing the files in the list.

The C{mode} argument selects the compression: C{'tar'} writes an
uncompressed tar file, C{'targz'} a gzipped tar file and C{'tarbz2'} a
bzipped tar file.

The tar file is meant to be created as a GNU tar archive, which enables
extended file name lengths, etc.  Since GNU tar is so prevalent, the extra
functionality was judged to outweigh the disadvantage of not being
"standard".

If C{flat} is true, then a "flat" archive is created, and all of the
files are added to the root of the archive.  So, the file
C{/tmp/something/whatever.txt} would be added as just C{whatever.txt}.

Unless C{ignore} is true, the whole method call fails if there are
problems adding any of the files to the archive, resulting in an
exception.  Under these circumstances, callers are advised that they
might want to call L{removeInvalid()} and then attempt to create the tar
file a second time, since the most common cause of failures is a missing
file (a file that existed when the list was built, but is gone again by
the time the tar file is built).

If C{ignore} is true, errors encountered when adding individual files to
the archive are logged and skipped (but not errors opening and closing
the archive itself).

Whenever an exception is about to leave this method after the archive
has been opened, the archive is closed and the partially-written tar
file is removed from disk on a best-effort basis.

@note: No validation is done as to whether the entries in the list are
files, since only files or soft links should be in an object like this.
However, to be safe, everything is explicitly added to the tar archive
non-recursively so it's safe to include soft links to directories.

@note: The Python C{tarfile} module, which is used internally here, is
supposed to deal properly with long filenames and links.  In the
original author's testing, it appeared to be able to add really long
filenames to archives, but did not do a good job reading them back out,
even out of an archive it created.  Fortunately, all Cedar Backup does
is add files to archives.

@param path: Path of tar file to create on disk
@type path: String representing a path on disk

@param mode: Tar creation mode
@type mode: One of either C{'tar'}, C{'targz'} or C{'tarbz2'}

@param ignore: Indicates whether to ignore certain errors.
@type ignore: Boolean

@param flat: Creates "flat" archive by putting all items in root
@type flat: Boolean

@raise ValueError: If mode is not valid
@raise ValueError: If list is empty
@raise ValueError: If the path could not be encoded properly.
@raise TarError: If there is a problem creating the tar file
"""
path = encodePath(path)
if len(self) == 0:
    raise ValueError("Empty list cannot be used to generate tarfile.")
if mode == 'tar':
    tarmode = "w:"
elif mode == 'targz':
    tarmode = "w:gz"
elif mode == 'tarbz2':
    tarmode = "w:bz2"
else:
    raise ValueError("Mode [%s] is not valid." % mode)
tar = None  # stays None when tarfile.open() itself fails
try:
    tar = tarfile.open(path, tarmode)
    # NOTE(review): presumably requests GNU rather than POSIX format on the
    # tarfile version this file targets -- confirm against that version.
    tar.posix = False
    for entry in self:
        try:
            if flat:
                tar.add(entry, arcname=os.path.basename(entry), recursive=False)
            else:
                tar.add(entry, recursive=False)
        except tarfile.TarError:
            if not ignore:
                raise
            logger.info("Unable to add file [%s]; going on anyway." % entry)
        except OSError:
            if not ignore:
                # Callers are promised TarError, so wrap the OS-level failure
                raise tarfile.TarError(sys.exc_info()[1])
            logger.info("Unable to add file [%s]; going on anyway." % entry)
    tar.close()
except Exception:
    # Hold on to the exception object before cleanup: on Python 2 a bare
    # "raise" after the nested handlers below could re-raise the wrong error.
    error = sys.exc_info()[1]
    if tar is not None:
        try:
            tar.close()
        except Exception:
            pass  # best effort; the original failure is what matters
    if os.path.exists(path):
        try:
            os.remove(path)
        except Exception:
            pass  # best effort; the original failure is what matters
    if isinstance(error, tarfile.ReadError):
        raise tarfile.ReadError("Unable to open [%s]; maybe directory doesn't exist?" % path)
    raise error
1134
1136 """
1137 Removes unchanged entries from the list.
1138
1139 This method relies on a digest map as returned from L{generateDigestMap}.
1140 For each entry in C{digestMap}, if the entry also exists in the current
1141 list I{and} the entry in the current list has the same digest value as in
1142 the map, the entry in the current list will be removed.
1143
1144 This method offers a convenient way for callers to filter unneeded
1145 entries from a list. The idea is that a caller will capture a digest map
1146 from C{generateDigestMap} at some point in time (perhaps the beginning of
1147 the week), and will save off that map using C{pickle} or some other
1148 method. Then, the caller could use this method sometime in the future to
1149 filter out any unchanged files based on the saved-off map.
1150
1151 If C{captureDigest} is passed-in as C{True}, then digest information will
1152 be captured for the entire list before the removal step occurs using the
1153 same rules as in L{generateDigestMap}. The check will involve a lookup
1154 into the complete digest map.
1155
1156 If C{captureDigest} is passed in as C{False}, we will only generate a
1157 digest value for files we actually need to check, and we'll ignore any
1158 entry in the list which isn't a file that currently exists on disk.
1159
1160 The return value varies depending on C{captureDigest}, as well. To
1161 preserve backwards compatibility, if C{captureDigest} is C{False}, then
1162 we'll just return a single value representing the number of entries
1163 removed. Otherwise, we'll return a tuple of C{(entries removed, digest
1164 map)}. The returned digest map will be in exactly the form returned by
1165 L{generateDigestMap}.
1166
1167 @note: For performance reasons, this method actually ends up rebuilding
1168 the list from scratch. First, we build a temporary dictionary containing
1169 all of the items from the original list. Then, we remove items as needed
1170 from the dictionary (which is faster than the equivalent operation on a
1171 list). Finally, we replace the contents of the current list based on the
1172 keys left in the dictionary. This should be transparent to the caller.
1173
1174 @param digestMap: Dictionary mapping file name to digest value.
1175 @type digestMap: Map as returned from L{generateDigestMap}.
1176
1177 @param captureDigest: Indicates that digest information should be captured.
1178 @type captureDigest: Boolean
1179
1180 @return: Number of entries removed
1181 """
1182 if captureDigest:
1183 removed = 0
1184 table = {}
1185 captured = {}
1186 for entry in self:
1187 if os.path.isfile(entry) and not os.path.islink(entry):
1188 table[entry] = BackupFileList._generateDigest(entry)
1189 captured[entry] = table[entry]
1190 else:
1191 table[entry] = None
1192 for entry in digestMap.keys():
1193 if table.has_key(entry):
1194 if table[entry] is not None:
1195 digest = table[entry]
1196 if digest == digestMap[entry]:
1197 removed += 1
1198 del table[entry]
1199 logger.debug("Discarded unchanged file [%s]." % entry)
1200 self[:] = table.keys()
1201 return (removed, captured)
1202 else:
1203 removed = 0
1204 table = {}
1205 for entry in self:
1206 table[entry] = None
1207 for entry in digestMap.keys():
1208 if table.has_key(entry):
1209 if os.path.isfile(entry) and not os.path.islink(entry):
1210 digest = BackupFileList._generateDigest(entry)
1211 if digest == digestMap[entry]:
1212 removed += 1
1213 del table[entry]
1214 logger.debug("Discarded unchanged file [%s]." % entry)
1215 self[:] = table.keys()
1216 return removed
1217
1218
1219
1220
1221
1222
1224
1225
1226
1227
1228
1229 """
1230 List of files and directories to be purged.
1231
1232 A PurgeItemList is a L{FilesystemList} containing a list of files and
1233 directories to be purged. On top of the generic functionality provided by
1234 L{FilesystemList}, this class adds functionality to remove items that are
1235 too young to be purged, and to actually remove each item in the list from
1236 the filesystem.
1237
1238 The other main difference is that when you add a directory's contents to a
1239 purge item list, the directory itself is not added to the list. This way,
1240 if someone asks to purge within C{/opt/backup/collect}, that directory
1241 doesn't get removed once all of the files within it are gone.
1242 """
1243
1244
1245
1246
1247
1251
1252
1253
1254
1255
1256
def addDirContents(self, path, recursive=True, addSelf=True, linkDepth=0, dereference=False):
    """
    Adds the contents of a directory to the list, never the directory itself.

    The path is encoded and normalized, and the work is then handed to the
    inherited C{_addDirContentsInternal}.  The literal C{False} is always
    passed in the position that C{addSelf} would otherwise occupy, so the
    C{addSelf} argument has no effect in this subclass.

    The remaining notes describe the inherited implementation, which is
    not part of this method.

    The path must exist and must be a directory or a link to a directory.
    Its contents are added recursively, subject to any exclusions that are
    in place.  To add only the immediate contents, pass in
    C{recursive=False}.

    @note: If a directory's absolute path matches an exclude pattern or path,
    or if the directory contains the configured ignore file, then the
    directory and all of its contents will be recursively excluded from the
    list.

    @note: If the passed-in directory happens to be a soft link, it will be
    recursed.  However, the linkDepth parameter controls whether any soft
    links I{within} the directory will be recursed.  The link depth is the
    maximum depth of the tree at which soft links should be followed.  So, a
    depth of 0 does not follow any soft links, a depth of 1 follows only
    links within the passed-in directory, a depth of 2 follows the links at
    the next level down, etc.

    @note: Any invalid soft links (i.e. soft links that point to
    non-existent items) will be silently ignored.

    @note: The L{excludeDirs} flag only controls whether any given directory
    or soft link path itself is added to the list once it has been
    discovered.  It does I{not} modify any behavior related to directory
    recursion.

    @note: If you call this method I{on a link to a directory} that link will
    never be dereferenced (it may, however, be followed).

    @param path: Directory path whose contents should be added to the list
    @type path: String representing a path on disk

    @param recursive: Indicates whether directory contents should be added recursively.
    @type recursive: Boolean value

    @param addSelf: Ignored in this subclass.

    @param linkDepth: Depth of soft links that should be followed
    @type linkDepth: Integer value, where zero means not to follow any soft links

    @param dereference: Indicates whether soft links, if followed, should be dereferenced
    @type dereference: Boolean value

    @return: Number of items recursively added to the list

    @raise ValueError: If path is not a directory or does not exist.
    @raise ValueError: If the path could not be encoded properly.
    """
    normalized = normalizeDir(encodePath(path))
    parent = super(PurgeItemList, self)
    return parent._addDirContentsInternal(normalized, False, recursive, linkDepth, dereference)
1316
1317
1318
1319
1320
1321
1323 """
1324 Removes from the list files younger than a certain age (in days).
1325
1326 Any file whose "age" in days is less than (C{<}) the value of the
1327 C{daysOld} parameter will be removed from the list so that it will not be
1328 purged later when L{purgeItems} is called. Directories and soft links
1329 will be ignored.
1330
1331 The "age" of a file is the amount of time since the file was last used,
1332 per the most recent of the file's C{st_atime} and C{st_mtime} values.
1333
1334 @note: Some people find the "sense" of this method confusing or
1335 "backwards". Keep in mind that this method is used to remove items
1336 I{from the list}, not from the filesystem! It removes from the list
1337 those items that you would I{not} want to purge because they are too
1338 young. As an example, passing in C{daysOld} of zero (0) would remove
1339 from the list no files, which would result in purging all of the files
1340 later. I would be happy to make a synonym of this method with an
1341 easier-to-understand "sense", if someone can suggest one.
1342
1343 @param daysOld: Minimum age of files that are to be kept in the list.
1344 @type daysOld: Integer value >= 0.
1345
1346 @return: Number of entries removed
1347 """
1348 removed = 0
1349 daysOld = int(daysOld)
1350 if daysOld < 0:
1351 raise ValueError("Days old value must be an integer >= 0.")
1352 for entry in self[:]:
1353 if os.path.isfile(entry) and not os.path.islink(entry):
1354 try:
1355 ageInDays = calculateFileAge(entry)
1356 ageInWholeDays = math.floor(ageInDays)
1357 if ageInWholeDays < daysOld:
1358 removed += 1
1359 self.remove(entry)
1360 except OSError:
1361 pass
1362 return removed
1363
1365 """
1366 Purges all items in the list.
1367
1368 Every item in the list will be purged. Directories in the list will
1369 I{not} be purged recursively, and hence will only be removed if they are
1370 empty. Errors will be ignored.
1371
1372 To faciliate easy removal of directories that will end up being empty,
1373 the delete process happens in two passes: files first (including soft
1374 links), then directories.
1375
1376 @return: Tuple containing count of (files, dirs) removed
1377 """
1378 files = 0
1379 dirs = 0
1380 for entry in self:
1381 if os.path.exists(entry) and (os.path.isfile(entry) or os.path.islink(entry)):
1382 try:
1383 os.remove(entry)
1384 files += 1
1385 logger.debug("Purged file [%s]." % entry)
1386 except OSError:
1387 pass
1388 for entry in self:
1389 if os.path.exists(entry) and os.path.isdir(entry) and not os.path.islink(entry):
1390 try:
1391 os.rmdir(entry)
1392 dirs += 1
1393 logger.debug("Purged empty directory [%s]." % entry)
1394 except OSError:
1395 pass
1396 return (files, dirs)
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
def normalizeDir(path):
    """
    Normalizes a directory name.

    For our purposes, a directory name is normalized by removing one trailing
    path separator, if any.  This is important because we want directories to
    appear within lists in a consistent way, although from the user's
    perspective passing in C{/path/to/dir/} and C{/path/to/dir} are equivalent.

    A path consisting of just the separator itself is returned unchanged.

    @param path: Path to be normalized.
    @type path: String representing a path on disk

    @return: Normalized path, which should be equivalent to the original.
    """
    hasTrailingSeparator = path[-1:] == os.sep
    if hasTrailingSeparator and path != os.sep:
        return path[:-1]
    return path
1424
1425
1426
1427
1428
1429
def compareContents(path1, path2, verbose=False):
    """
    Compares the contents of two directories to see if they are equivalent.

    For each directory, a L{BackupFileList} is filled via C{addDirContents}
    and a digest map is generated with the (normalized) directory path
    stripped from the front of each entry.  The two digest maps are then
    handed to L{compareDigestMaps}, which decides whether they match.

    If no exception is thrown, the two directories are considered identical.

    If the C{verbose} flag is C{True}, then an alternate (but slower) method is
    used so that the thrown C{ValueError} distinguishes between the
    directories containing different files, and containing the same files
    with differing content.

    @note: Symlinks are I{not} followed for the purposes of this comparison.

    @param path1: First path to compare.
    @type path1: String representing a path on disk

    @param path2: Second path to compare.
    @type path2: String representing a path on disk

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If a directory doesn't exist or can't be read.
    @raise ValueError: If the two directories are not equivalent.
    @raise IOError: If there is an unusual problem reading the directories.
    """
    try:
        digestMaps = []
        for path in (path1, path2):
            contents = BackupFileList()
            contents.addDirContents(path)
            digestMaps.append(contents.generateDigestMap(stripPrefix=normalizeDir(path)))
        compareDigestMaps(digestMaps[0], digestMaps[1], verbose)
    except IOError:
        logger.error("I/O error encountered during consistency check.")
        raise  # re-raise the same error with its original traceback
1478
def compareDigestMaps(digest1, digest2, verbose=False):
    """
    Compares two digest maps and throws an exception if they differ.

    In non-verbose mode the two maps are compared as a whole.  In verbose
    mode the sets of keys are compared first, and then the digest values are
    compared key by key in sorted key order, so the file named in the
    exception is always the first differing one in that order.

    @param digest1: First digest to compare.
    @type digest1: Digest as returned from BackupFileList.generateDigestMap()

    @param digest2: Second digest to compare.
    @type digest2: Digest as returned from BackupFileList.generateDigestMap()

    @param verbose: Indicates whether a verbose response should be given.
    @type verbose: Boolean

    @raise ValueError: If the two directories are not equivalent.
    """
    if not verbose:
        if digest1 != digest2:
            raise ValueError("Consistency check failed.")
        return
    # Dictionary keys are unique, so comparing them as sets ignores order
    if set(digest1) != set(digest2):
        raise ValueError("Directories contain a different set of files.")
    for key in sorted(digest1):
        if digest1[key] != digest2[key]:
            raise ValueError("File contents for [%s] vary between directories." % key)
1505