[PATCH 3/3] Open files only once

Michael Hanselmann public at hansmi.ch
Mon Jul 14 23:05:04 UTC 2008


- Reduced I/O load by only reading its content only once
- Easier error handling (before a file could disappear between two
  calls to open())
---
 lib/Brackup/File.pm            |   92 +++++++++++++++++++++-------------------
 lib/Brackup/PositionedChunk.pm |   47 ++-------------------
 2 files changed, 52 insertions(+), 87 deletions(-)

diff --git a/lib/Brackup/File.pm b/lib/Brackup/File.pm
index 827eff1..f24907c 100644
--- a/lib/Brackup/File.pm
+++ b/lib/Brackup/File.pm
@@ -30,6 +30,8 @@ sub new {
     die "No path provided." unless $self->{path};
     $self->{path} =~ s!^\./!!;
 
+    $self->_read_file();
+
     return $self;
 }
 
@@ -119,18 +121,22 @@ sub find_file_chunker {
 
 sub chunks {
     my $self = shift;
-    # memoized:
-    return @{ $self->{chunks} } if $self->{chunks};
+    return @{$self->{chunks}};
+}
+
+sub _read_file {
+    our ($self) = @_;
 
     # non-files don't have chunks
-    if (!$self->is_file) {
+    unless ($self->is_file) {
         $self->{chunks} = [];
-        return ();
+        return;
     }
 
-    # Get the appropriate FileChunker for this file type,
-    # then pass ourselves to it to get our chunks.
-    my @chunk_list;
+    my $fullpath = $self->fullpath;
+    open(my $fh, '<', $fullpath)
+        or die "Failed to open $fullpath: $!\n";
+    binmode($fh);
 
     # Get the appropriate FileChunker for this file type
     my $chunker = $self->find_file_chunker();
@@ -139,6 +145,11 @@ sub chunks {
     my $last_offset = undef;
     my $last_len = undef;
 
+    my $sha1_file = new Digest::SHA1;
+    my $sha1_chunk = new Digest::SHA1;
+
+    my @chunks;
+
     $chunker->get_offsets($self, sub {
         my ($offset, $len) = @_;
 
@@ -164,10 +175,31 @@ sub chunks {
             die "Chunker didn't start at offset 0";
         }
 
-        push @chunk_list, new Brackup::PositionedChunk(
+        # Optimize for continous reading
+        unless (tell($fh) == $offset) {
+            seek($fh, $offset, 0) or die "Couldn't seek: $!\n";
+        }
+
+        my $data;
+        my $rv = read($fh, $data, $len) or die "Failed to read: $!\n";
+        unless ($rv == $len) {
+            Carp::confess("Read $rv bytes, not $len");
+        }
+
+        # Update digest for complete file
+        $sha1_file->add($data);
+
+        # Generate chunk digest
+        $sha1_chunk->reset();
+        $sha1_chunk->add($data);
+        my $chunk_digest = 'sha1:' . $sha1_chunk->hexdigest();
+
+        push @chunks, new Brackup::PositionedChunk(
             file => $self,
             offset => $offset,
             length => $len,
+            chunkref => \$data,
+            digest => $chunk_digest,
         );
 
         $last_offset = $offset;
@@ -178,46 +210,18 @@ sub chunks {
         die "Chunker didn't return complete file";
     }
 
-    $self->{chunks} = \@chunk_list;
-    return @chunk_list;
-}
+    my $file_digest = 'sha1:' . $sha1_file->hexdigest();
 
-sub full_digest {
-    my $self = shift;
-    return $self->{_full_digest} ||= $self->_calc_full_digest;
-}
-
-sub _calc_full_digest {
-    my $self = shift;
-    return "" unless $self->is_file;
-
-    my $cache = $self->{root}->digest_cache;
-    my $key   = $self->cachekey;
-
-    my $dig = $cache->get($key);
-    return $dig if $dig;
+    $self->{_full_digest} = $file_digest;
 
-    # legacy migration thing... we used to more often store
-    # the chunk digests, not the file digests.  so try that
-    # first...
-    if ($self->chunks == 1) {
-        my ($chunk) = $self->chunks;
-        $dig = $cache->get($chunk->cachekey);
-    }
-
-    unless ($dig) {
-        my $sha1 = Digest::SHA1->new;
-        my $path = $self->fullpath;
-        open (my $fh, $path) or die "Couldn't open $path: $!\n";
-        binmode($fh);
-        $sha1->addfile($fh);
-        close($fh);
+    close($fh);
 
-        $dig = "sha1:" . $sha1->hexdigest;
-    }
+    $self->{chunks} = \@chunks;
+}
 
-    $cache->set($key => $dig);
-    return $dig;
+sub full_digest {
+    my $self = shift;
+    return $self->{_full_digest};
 }
 
 sub link_target {
diff --git a/lib/Brackup/PositionedChunk.pm b/lib/Brackup/PositionedChunk.pm
index b5fabc1..8ed806c 100644
--- a/lib/Brackup/PositionedChunk.pm
+++ b/lib/Brackup/PositionedChunk.pm
@@ -20,6 +20,8 @@ sub new {
     $self->{file}   = delete $opts{'file'};    # Brackup::File object
     $self->{offset} = delete $opts{'offset'};
     $self->{length} = delete $opts{'length'};
+    $self->{_raw_chunkref} = delete $opts{'chunkref'};
+    $self->{_raw_digest} = delete $opts{'digest'};
 
     croak("Unknown options: " . join(', ', keys %opts)) if %opts;
     croak("offset not numeric") unless $self->{offset} =~ /^\d+$/;
@@ -55,53 +57,12 @@ sub root {
 
 sub raw_digest {
     my $self = shift;
-    return $self->{_raw_digest} ||= $self->_calc_raw_digest;
-}
-
-sub _calc_raw_digest {
-    my $self = shift;
-
-    my $n_chunks = $self->{file}->chunks
-        or die "zero chunks?";
-    if ($n_chunks == 1) {
-        # don't calculate this chunk's digest.. it's the same as our
-        # file's digest, since this chunk spans the entire file.
-        die "ASSERT" unless $self->length == $self->{file}->size;
-        return $self->{file}->full_digest;
-    }
-
-    my $cache = $self->root->digest_cache;
-    my $key   = $self->cachekey;
-    my $dig;
-
-    if ($dig = $cache->get($key)) {
-        return $self->{_raw_digest} = $dig;
-    }
-
-    my $rchunk = $self->raw_chunkref;
-    $dig = "sha1:" . sha1_hex($$rchunk);
-
-    $cache->set($key => $dig);
-
-    return $self->{_raw_digest} = $dig;
+    return $self->{_raw_digest};
 }
 
 sub raw_chunkref {
     my $self = shift;
-    return $self->{_raw_chunkref} if $self->{_raw_chunkref};
-
-    my $data;
-    my $fullpath = $self->{file}->fullpath;
-    open(my $fh, $fullpath) or die "Failed to open $fullpath: $!\n";
-    binmode($fh);
-    seek($fh, $self->{offset}, 0) or die "Couldn't seek: $!\n";
-    my $rv = read($fh, $data, $self->{length})
-        or die "Failed to read: $!\n";
-    unless ($rv == $self->{length}) {
-        Carp::confess("Read $rv bytes, not $self->{length}");
-    }
-
-    return $self->{_raw_chunkref} = \$data;
+    return $self->{_raw_chunkref};
 }
 
 # useful string for targets to key on.  of one of the forms:
-- 
1.5.4.5



More information about the brackup mailing list