Comments:<br><br><div class="gmail_quote">On Mon, Jul 14, 2008 at 4:05 PM, Michael Hanselmann <public@hansmi.ch> wrote:<br><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
- Reduced I/O load by only reading its content only once<br>
- Easier error handling (before a file could disappear between two<br>
calls to open())<br>
---<br>
lib/Brackup/File.pm | 92 +++++++++++++++++++++-------------------<br>
lib/Brackup/PositionedChunk.pm | 47 ++-------------------<br>
2 files changed, 52 insertions(+), 87 deletions(-)<br>
<br>
diff --git a/lib/Brackup/File.pm b/lib/Brackup/File.pm<br>
index 827eff1..f24907c 100644<br>
--- a/lib/Brackup/File.pm<br>
+++ b/lib/Brackup/File.pm<br>
@@ -30,6 +30,8 @@ sub new {<br>
die "No path provided." unless $self->{path};<br>
$self->{path} =~ s!^\./!!;<br>
<br>
+ $self->_read_file();</blockquote><div><br>I prefer style of no parens on nullary method call.<br> </div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
<br>
+<br>
return $self;<br>
}<br>
<br>
@@ -119,18 +121,22 @@ sub find_file_chunker {<br>
<br>
sub chunks {<br>
my $self = shift;<br>
- # memoized:<br>
- return @{ $self->{chunks} } if $self->{chunks};<br>
+ return @{$self->{chunks}};<br>
+}<br>
+<br>
+sub _read_file {<br>
+ our ($self) = @_;</blockquote><div><br>Wrong use of our here. You want my.<br> </div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
# non-files don't have chunks<br>
- if (!$self->is_file) {<br>
+ unless ($self->is_file) {</blockquote><div><br>I love unless, but I prefer to use it only for exceptional cases, or when it just reads better. (the word "unless" itself reads like an exceptional case) I'd keep this with "if not". "If it's not a file" is a common case.<br>
<br><br></div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;"> $self->{chunks} = [];<br>
- return ();<br>
+ return;<br>
}</blockquote><div><br>This function is called in list context. I like to stay explicit about that in my return type. I don't want to rely on implicit casts.<br> <br></div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
- # Get the appropriate FileChunker for this file type,<br>
- # then pass ourselves to it to get our chunks.<br>
- my @chunk_list;<br>
+ my $fullpath = $self->fullpath;<br>
+ open(my $fh, '<', $fullpath)<br>
+ or die "Failed to open $fullpath: $!\n";<br>
+ binmode($fh);<br>
<br>
# Get the appropriate FileChunker for this file type<br>
my $chunker = $self->find_file_chunker();<br>
@@ -139,6 +145,11 @@ sub chunks {<br>
my $last_offset = undef;<br>
my $last_len = undef;<br>
<br>
+ my $sha1_file = new Digest::SHA1;<br>
+ my $sha1_chunk = new Digest::SHA1;<br>
+<br>
+ my @chunks;<br>
+<br>
$chunker->get_offsets($self, sub {<br>
my ($offset, $len) = @_;<br>
<br>
@@ -164,10 +175,31 @@ sub chunks {<br>
die "Chunker didn't start at offset 0";<br>
}<br>
<br>
- push @chunk_list, new Brackup::PositionedChunk(</blockquote><div><br>Brackup::PositionedChunk->new<br> </div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
<br>
+ # Optimize for continous reading<br>
+ unless (tell($fh) == $offset) {<br>
+ seek($fh, $offset, 0) or die "Couldn't seek: $!\n";<br>
+ }<br>
+<br>
+ my $data;<br>
+ my $rv = read($fh, $data, $len) or die "Failed to read: $!\n";<br>
+ unless ($rv == $len) {<br>
+ Carp::confess("Read $rv bytes, not $len");<br>
+ }<br>
+<br>
+ # Update digest for complete file<br>
+ $sha1_file->add($data);<br>
+<br>
+ # Generate chunk digest<br>
+ $sha1_chunk->reset();</blockquote><div><br>->reset;<br> <br></div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;"><br>
+ $sha1_chunk->add($data);<br>
+ my $chunk_digest = 'sha1:' . $sha1_chunk->hexdigest();</blockquote><div><br>->hexdigest;<br> </div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;">
<br>
+<br>
+ push @chunks, new Brackup::PositionedChunk(<br>
file => $self,<br>
offset => $offset,<br>
length => $len,<br>
+ chunkref => \$data,<br>
+ digest => $chunk_digest,<br>
);</blockquote><div><br>Uh, you do this unconditionally? Not dependent on the size of the file!? How am I going to backup a huge file on a low-memory box?<br><br>Let's discuss that first before I review more.<br>
<br>- Brad<br><br><br> <br></div><blockquote class="gmail_quote" style="border-left: 1px solid rgb(204, 204, 204); margin: 0pt 0pt 0pt 0.8ex; padding-left: 1ex;"> $last_offset = $offset;<br>
@@ -178,46 +210,18 @@ sub chunks {<br>
die "Chunker didn't return complete file";<br>
}<br>
<br>
- $self->{chunks} = \@chunk_list;<br>
- return @chunk_list;<br>
-}<br>
+ my $file_digest = 'sha1:' . $sha1_file->hexdigest();<br>
<br>
-sub full_digest {<br>
- my $self = shift;<br>
- return $self->{_full_digest} ||= $self->_calc_full_digest;<br>
-}<br>
-<br>
-sub _calc_full_digest {<br>
- my $self = shift;<br>
- return "" unless $self->is_file;<br>
-<br>
- my $cache = $self->{root}->digest_cache;<br>
- my $key = $self->cachekey;<br>
-<br>
- my $dig = $cache->get($key);<br>
- return $dig if $dig;<br>
+ $self->{_full_digest} = $file_digest;<br>
<br>
- # legacy migration thing... we used to more often store<br>
- # the chunk digests, not the file digests. so try that<br>
- # first...<br>
- if ($self->chunks == 1) {<br>
- my ($chunk) = $self->chunks;<br>
- $dig = $cache->get($chunk->cachekey);<br>
- }<br>
-<br>
- unless ($dig) {<br>
- my $sha1 = Digest::SHA1->new;<br>
- my $path = $self->fullpath;<br>
- open (my $fh, $path) or die "Couldn't open $path: $!\n";<br>
- binmode($fh);<br>
- $sha1->addfile($fh);<br>
- close($fh);<br>
+ close($fh);<br>
<br>
- $dig = "sha1:" . $sha1->hexdigest;<br>
- }<br>
+ $self->{chunks} = \@chunks;<br>
+}<br>
<br>
- $cache->set($key => $dig);<br>
- return $dig;<br>
+sub full_digest {<br>
+ my $self = shift;<br>
+ return $self->{_full_digest};<br>
}<br>
<br>
sub link_target {<br>
diff --git a/lib/Brackup/PositionedChunk.pm b/lib/Brackup/PositionedChunk.pm<br>
index b5fabc1..8ed806c 100644<br>
--- a/lib/Brackup/PositionedChunk.pm<br>
+++ b/lib/Brackup/PositionedChunk.pm<br>
@@ -20,6 +20,8 @@ sub new {<br>
$self->{file} = delete $opts{'file'}; # Brackup::File object<br>
$self->{offset} = delete $opts{'offset'};<br>
$self->{length} = delete $opts{'length'};<br>
+ $self->{_raw_chunkref} = delete $opts{'chunkref'};<br>
+ $self->{_raw_digest} = delete $opts{'digest'};<br>
<br>
croak("Unknown options: " . join(', ', keys %opts)) if %opts;<br>
croak("offset not numeric") unless $self->{offset} =~ /^\d+$/;<br>
@@ -55,53 +57,12 @@ sub root {<br>
<br>
sub raw_digest {<br>
my $self = shift;<br>
- return $self->{_raw_digest} ||= $self->_calc_raw_digest;<br>
-}<br>
-<br>
-sub _calc_raw_digest {<br>
- my $self = shift;<br>
-<br>
- my $n_chunks = $self->{file}->chunks<br>
- or die "zero chunks?";<br>
- if ($n_chunks == 1) {<br>
- # don't calculate this chunk's digest.. it's the same as our<br>
- # file's digest, since this chunk spans the entire file.<br>
- die "ASSERT" unless $self->length == $self->{file}->size;<br>
- return $self->{file}->full_digest;<br>
- }<br>
-<br>
- my $cache = $self->root->digest_cache;<br>
- my $key = $self->cachekey;<br>
- my $dig;<br>
-<br>
- if ($dig = $cache->get($key)) {<br>
- return $self->{_raw_digest} = $dig;<br>
- }<br>
-<br>
- my $rchunk = $self->raw_chunkref;<br>
- $dig = "sha1:" . sha1_hex($$rchunk);<br>
-<br>
- $cache->set($key => $dig);<br>
-<br>
- return $self->{_raw_digest} = $dig;<br>
+ return $self->{_raw_digest};<br>
}<br>
<br>
sub raw_chunkref {<br>
my $self = shift;<br>
- return $self->{_raw_chunkref} if $self->{_raw_chunkref};<br>
-<br>
- my $data;<br>
- my $fullpath = $self->{file}->fullpath;<br>
- open(my $fh, $fullpath) or die "Failed to open $fullpath: $!\n";<br>
- binmode($fh);<br>
- seek($fh, $self->{offset}, 0) or die "Couldn't seek: $!\n";<br>
- my $rv = read($fh, $data, $self->{length})<br>
- or die "Failed to read: $!\n";<br>
- unless ($rv == $self->{length}) {<br>
- Carp::confess("Read $rv bytes, not $self->{length}");<br>
- }<br>
-<br>
- return $self->{_raw_chunkref} = \$data;<br>
+ return $self->{_raw_chunkref};<br>
}<br>
<br>
# useful string for targets to key on. of one of the forms:<br>
<font color="#888888">--<br>
<a href="http://1.5.4.5" target="_blank">1.5.4.5</a><br>
<br>
</font></blockquote></div><br>