[PATCH 3/3] Open files only once

Tue Jul 15 00:55:50 UTC 2008

Comments:

On Mon, Jul 14, 2008 at 4:05 PM, Michael Hanselmann <public at hansmi.ch>
wrote:

> - Reduced I/O load by only reading its content only once
> - Easier error handling (before a file could disappear between two
>  calls to open())
> ---
>  lib/Brackup/File.pm            |   92
> +++++++++++++++++++++-------------------
>  lib/Brackup/PositionedChunk.pm |   47 ++-------------------
>  2 files changed, 52 insertions(+), 87 deletions(-)
>
> diff --git a/lib/Brackup/File.pm b/lib/Brackup/File.pm
> index 827eff1..f24907c 100644
> --- a/lib/Brackup/File.pm
> +++ b/lib/Brackup/File.pm
> @@ -30,6 +30,8 @@ sub new {
>     die "No path provided." unless $self->{path};
>     $self->{path} =~ s!^\./!!;
>
> +    $self->_read_file();

I prefer style of no parens on nullary method call.

>
> +
>     return $self;
>  }
>
> @@ -119,18 +121,22 @@ sub find_file_chunker {
>
>  sub chunks {
>     my $self = shift;
> -    # memoized:
> -    return @{ $self->{chunks} } if $self->{chunks};
> +    return @{$self->{chunks}};
> +}
> +
> +sub _read_file {
> +    our ($self) = @_;

Wrong use of our here.  You want my.

>     # non-files don't have chunks
> -    if (!$self->is_file) {
> +    unless ($self->is_file) {

I love unless, but I prefer to use it only for exceptional cases, or when it
just reads better.  (the word "unless" itself reads like an exceptional
case)  I'd keep this with "if not".  "If it's not a file" is a common case.

        $self->{chunks} = [];
> -        return ();
> +        return;
>     }

This function is called in list context.  I like to stay explicit about that
in my return type.  I don't want to rely on implicit casts.

> -    # Get the appropriate FileChunker for this file type,
> -    # then pass ourselves to it to get our chunks.
> -    my @chunk_list;
> +    my $fullpath = $self->fullpath;
> +    open(my $fh, '<', $fullpath)
> +        or die "Failed to open $fullpath: $!\n";
> +    binmode($fh);
>
>     # Get the appropriate FileChunker for this file type
>     my $chunker = $self->find_file_chunker();
> @@ -139,6 +145,11 @@ sub chunks {
>     my $last_offset = undef;
>     my $last_len = undef;
>
> +    my $sha1_file = new Digest::SHA1;
> +    my $sha1_chunk = new Digest::SHA1;
> +
> +    my @chunks;
> +
>     $chunker->get_offsets($self, sub {
>         my ($offset, $len) = @_;
>
> @@ -164,10 +175,31 @@ sub chunks {
>             die "Chunker didn't start at offset 0";
>         }
>
> -        push @chunk_list, new Brackup::PositionedChunk(

Brackup::PositionedChunk->new

>
> +        # Optimize for continous reading
> +        unless (tell($fh) == $offset) {
> +            seek($fh, $offset, 0) or die "Couldn't seek: $!\n";
> +        }
> +
> +        my $data;
> +        my $rv = read($fh, $data, $len) or die "Failed to read: $!\n";
> +        unless ($rv == $len) {
> +            Carp::confess("Read $rv bytes, not $len");
> +        }
> +
> +        # Update digest for complete file
> +        $sha1_file->add($data);
> +
> +        # Generate chunk digest
> +        $sha1_chunk->reset();

->reset;

>
> +        $sha1_chunk->add($data);
> +        my $chunk_digest = 'sha1:' . $sha1_chunk->hexdigest();

->hexdigest;

>
> +
> +        push @chunks, new Brackup::PositionedChunk(
>             file => $self,
>             offset => $offset,
>             length => $len,
> +            chunkref => \$data,
> +            digest => $chunk_digest,
>         );

Uh, you do this unconditionally?  Not dependent on the size of the file!?
How am I going to backup a huge file on a low-memory box?

Let's discuss that first before I review more.

- Brad

>         $last_offset = $offset;
> @@ -178,46 +210,18 @@ sub chunks {
>         die "Chunker didn't return complete file";
>     }
>
> -    $self->{chunks} = \@chunk_list;
> -    return @chunk_list;
> -}
> +    my $file_digest = 'sha1:' . $sha1_file->hexdigest();
>
> -sub full_digest {
> -    my $self = shift;
> -    return $self->{_full_digest} ||= $self->_calc_full_digest;
> -}
> -
> -sub _calc_full_digest {
> -    my $self = shift;
> -    return "" unless $self->is_file;
> -
> -    my $cache = $self->{root}->digest_cache;
> -    my $key   = $self->cachekey;
> -
> -    my $dig = $cache->get($key);
> -    return $dig if $dig;
> +    $self->{_full_digest} = $file_digest;
>
> -    # legacy migration thing... we used to more often store
> -    # the chunk digests, not the file digests.  so try that
> -    # first...
> -    if ($self->chunks == 1) {
> -        my ($chunk) = $self->chunks;
> -        $dig = $cache->get($chunk->cachekey);
> -    }
> -
> -    unless ($dig) {
> -        my $sha1 = Digest::SHA1->new;
> -        my $path = $self->fullpath;
> -        open (my $fh, $path) or die "Couldn't open $path: $!\n";
> -        binmode($fh);
> -        $sha1->addfile($fh);
> -        close($fh);
> +    close($fh);
>
> -        $dig = "sha1:" . $sha1->hexdigest;
> -    }
> +    $self->{chunks} = \@chunks;
> +}
>
> -    $cache->set($key => $dig);
> -    return $dig;
> +sub full_digest {
> +    my $self = shift;
> +    return $self->{_full_digest};
>  }
>
>  sub link_target {
> diff --git a/lib/Brackup/PositionedChunk.pm
> b/lib/Brackup/PositionedChunk.pm
> index b5fabc1..8ed806c 100644
> --- a/lib/Brackup/PositionedChunk.pm
> +++ b/lib/Brackup/PositionedChunk.pm
> @@ -20,6 +20,8 @@ sub new {
>     $self->{file}   = delete $opts{'file'};    # Brackup::File object
>     $self->{offset} = delete $opts{'offset'};
>     $self->{length} = delete $opts{'length'};
> +    $self->{_raw_chunkref} = delete $opts{'chunkref'};
> +    $self->{_raw_digest} = delete $opts{'digest'};
>
>     croak("Unknown options: " . join(', ', keys %opts)) if %opts;
>     croak("offset not numeric") unless $self->{offset} =~ /^\d+$/;
> @@ -55,53 +57,12 @@ sub root {
>
>  sub raw_digest {
>     my $self = shift;
> -    return $self->{_raw_digest} ||= $self->_calc_raw_digest;
> -}
> -
> -sub _calc_raw_digest {
> -    my $self = shift;
> -
> -    my $n_chunks = $self->{file}->chunks
> -        or die "zero chunks?";
> -    if ($n_chunks == 1) {
> -        # don't calculate this chunk's digest.. it's the same as our
> -        # file's digest, since this chunk spans the entire file.
> -        die "ASSERT" unless $self->length == $self->{file}->size;
> -        return $self->{file}->full_digest;
> -    }
> -
> -    my $cache = $self->root->digest_cache;
> -    my $key   = $self->cachekey;
> -    my $dig;
> -
> -    if ($dig = $cache->get($key)) {
> -        return $self->{_raw_digest} = $dig;
> -    }
> -
> -    my $rchunk = $self->raw_chunkref;
> -    $dig = "sha1:" . sha1_hex($$rchunk);
> -
> -    $cache->set($key => $dig);
> -
> -    return $self->{_raw_digest} = $dig;
> +    return $self->{_raw_digest};
>  }
>
>  sub raw_chunkref {
>     my $self = shift;
> -    return $self->{_raw_chunkref} if $self->{_raw_chunkref};
> -
> -    my $data;
> -    my $fullpath = $self->{file}->fullpath;
> -    open(my $fh, $fullpath) or die "Failed to open $fullpath: $!\n";
> -    binmode($fh);
> -    seek($fh, $self->{offset}, 0) or die "Couldn't seek: $!\n";
> -    my $rv = read($fh, $data, $self->{length})
> -        or die "Failed to read: $!\n";
> -    unless ($rv == $self->{length}) {
> -        Carp::confess("Read $rv bytes, not $self->{length}");
> -    }
> -
> -    return $self->{_raw_chunkref} = \$data;
> +    return $self->{_raw_chunkref};
>  }
>
>  # useful string for targets to key on.  of one of the forms:
> --
> 1.5.4.5
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.danga.com/pipermail/brackup/attachments/20080714/38031ad3/attachment.htm