Patch to MogileFS::Client to fetch only a range of bytes from a file

Arthur Bebak abebak at fabrikinc.com
Wed May 23 01:12:15 UTC 2007


Arthur Bebak wrote:

>>
>> Arthur, I also couldn't get the patch to apply to svn.  You just want to
>> send me your htdigest line for realm "Danga" and commit it yourself, once
>> you fix up the Content-Range response header check + explosion?
> 
> Happy to. Here's my htdigest:
> 
> arthur:Danga:828465da04cfd1def3759439f0736389

Brad, I tried to commit the patch but I'm not getting authenticated.
Oh well. Here it is again, improved per Bjørn Hansen's comments regarding
checking the returned range header.

This is against version 996 of
api/perl/MogileFS-Client/lib/MogileFS/Client.pm

"patch Client.pm this_patch" applies it cleanly.

Also note that I think I found a bug in how the MogileFS back end
handles ranges. According to RFC 2616 section 14.35.1, a
"Range: bytes=-500" HTTP header should be treated as "get me the last 500
bytes of the file". Testing shows that the current MogileFS back end
erroneously interprets it as "get me bytes 0 to 500 of the file".
Let me know if you want a more formal bug report.

Thanks and have a good vacation.

--- Client.pm   2007-05-22 17:56:07.000000000 -0700
+++ Client.new  2007-05-22 17:48:34.000000000 -0700
@@ -468,6 +468,214 @@
      return undef;
  }

+=head2 get_file_data_range
+
+Given a key and a range returns a scalar reference to a string which contains
+the requested byte range of the file.
+
+For example, to get 1000 bytes starting at byte 100 you can do something
+like this:
+
+   %arg_hash = ( "length" => "1000", "offset" => "100");
+   $content_ref = $mogfs->get_file_data_range( $key, %arg_hash );
+
+The same example, using a range:
+
+   %arg_hash = ( "range" => "100-999");
+   $content_ref = $mogfs->get_file_data_range( $key, %arg_hash );
+
+See the definition of the HTTP "Range:" header in RFC 2616 section 14.35.1
+for details of what the "range" key can look like, but in general assuming a file
+of size 10000 you can do range values like this:
+
+=over 2
+
+=item
+
+The first 500 bytes (byte offsets 0-499, inclusive):     "range" => "0-499"
+
+=item
+
+The second 500 bytes (byte offsets 500-999, inclusive):  "range" => "500-999"
+
+=item
+
+The final 500 bytes (byte offsets 9500-9999, inclusive): "range" => "9500-"
+
+=item
+
+The first and last bytes only (bytes 0 and 9999):        "range" => "0-0,-1"
+
+=item
+
+The final 500 bytes (byte offsets 9500-9999, inclusive): "range" => "-500"
+
+WARNING: At the time of this writing the MogileFS server does not meet the
+RFC 2616 spec in this case and treats this case as "range" => "0-500", which is
+presumably not what you want. Always specify the starting and ending bytes in
+your ranges and you should be OK.
+
+=back
+
+As specified in the WARNING above be careful because not all web servers support
+all of the range header formats from RFC 2616. Test against whatever is running
+on your mogstored nodes. You should usually be safe by specifying the starting
+and ending bytes in your range.
+
+The other way to get a range is to give an offset into the file, and
+specify the length.
+
+So for example, given "length" => 1000, "offset" => 100, you'd get the
+equivalent of "range" => "100-1099". Note that the offset byte is included,
+so in general the formula is:
+
+   $range = $offset . "-" . ($offset + $length - 1);
+
+If offset is not given, then it is assumed that "offset" => 0. This makes
+it easy to get the first $n bytes of the file:
+
+   $n = 100;
+   %arg_hash = ( "length" => $n );
+   $content_ref = $mogfs->get_file_data_range( $key, %arg_hash );
+
+If the range key is defined, length/offset are ignored.
+
+The file is fetched from the storage nodes using LWP::UserAgent, so
+%arg_hash can also have a "timeout" key which is just passed on to
+that module. The request is aborted if no activity on the connection to
+the server is observed for "timeout" seconds:
+
+   %arg_hash = ( "length" => 100, "timeout" => 30 );
+   $content_ref = $mogfs->get_file_data_range( $key, %arg_hash );
+
+On error get_file_data_range returns undef.
+
+=cut
+
+sub get_file_data_range {
+    # Fetch all or part of the file stored under $key.
+    #
+    # Arguments: $key, followed by a flat hash of options:
+    #   range   - RFC 2616 byte-range spec, without the "bytes=" prefix
+    #   offset  - first byte wanted, default 0 (ignored if range is given)
+    #   length  - number of bytes wanted (ignored if range is given)
+    #   timeout - LWP::UserAgent inactivity timeout in seconds, default 10
+    #
+    # Returns a reference to a scalar holding the bytes, or undef on error.
+    #
+    # Each path from get_paths is tried in turn.  An HTTP path that fails,
+    # or that ignores a requested range, is skipped in favour of the next
+    # one.  Local paths only understand a single range ("N-M", "N-", "-N");
+    # anything else makes the call return undef.
+    my MogileFS::Client $self = shift;
+    my ($key, %arg_hash) = @_;
+
+    my $timeout = $arg_hash{'timeout'};
+
+    # An explicit range wins; otherwise build one from offset/length.
+    my $range;
+    if (exists $arg_hash{'range'}) {
+        $range = $arg_hash{'range'};
+    }
+    elsif (exists $arg_hash{'length'}) {
+        my $offset = exists $arg_hash{'offset'} ? $arg_hash{'offset'} : 0;
+        my $last   = $offset + $arg_hash{'length'} - 1;
+        $range = "$offset-$last";
+    }
+
+    my @paths = $self->get_paths($key, 1);
+    return undef unless @paths;
+
+    foreach my $path (@paths) {
+        next unless defined $path;
+
+        if ($path =~ m!^http://!) {
+            my $ua = LWP::UserAgent->new;
+            $ua->timeout($timeout || 10);
+
+            my $res;
+            if (defined $range) {
+                # Sends the request header "Range: bytes=$range".
+                $res = $ua->get($path, "Range" => "bytes=$range");
+            }
+            else {
+                $res = $ua->get($path);
+            }
+
+            # A failed fetch falls through to the next replica.
+            next unless $res->is_success;
+
+            # A server that ignores Range answers with the whole file and
+            # no Content-Range header; never hand that back as if it were
+            # the requested slice.  Only checked when a range was sent.
+            next if defined $range && !$res->header("Content-Range");
+
+            my $contents = $res->content;
+            return \$contents;
+        }
+        else {
+            # Local path: three-arg open with a lexical handle, so the
+            # handle is released on every exit from this block.
+            open(my $fh, '<', $path) or next;
+            binmode $fh;
+
+            my $contents;
+            if (defined $range) {
+                my $fsize = (stat($fh))[7];
+                my ($offset, $length) = _resolve_byte_range($range, $fsize);
+                return undef unless defined $length;
+
+                seek($fh, $offset, 0) or return undef;
+                my $read_status = read($fh, $contents, $length);
+                return undef unless defined $read_status;
+            }
+            else {
+                # No range, so slurp the entire file.
+                local $/ = undef;
+                $contents = <$fh>;
+            }
+
+            close $fh;
+
+            # Test definedness, not truth: "0" and "" are valid contents.
+            return \$contents if defined $contents;
+        }
+    }
+    return undef;
+}
+
+# Translate a single RFC 2616 byte-range spec ("N-M", "N-" or "-N") into
+# an (offset, length) pair for a file of $fsize bytes.  Returns an empty
+# list when the spec is malformed, lists several ranges, or cannot be
+# satisfied for a file of that size.
+#
+# For a 10000-byte file:
+#   "0-499" => (0, 500)
+#   "9500-" => (9500, 500)
+#   "-500"  => (9500, 500)
+sub _resolve_byte_range {
+    my ($range, $fsize) = @_;
+
+    return unless defined $range;
+    return unless $range =~ /\A(\d*)-(\d*)\z/;
+    my ($start, $end) = ($1, $2);
+    # A bare "-" names no bytes at all.
+    return if $start eq "" && $end eq "";
+
+    if ($start eq "") {
+        # "-N": the last N bytes, clamped to the file size.
+        my $length = $end > $fsize ? $fsize : $end;
+        return unless $length > 0;
+        return ($fsize - $length, $length);
+    }
+
+    # "N-" runs to EOF; "N-M" includes byte M, clamped to the last byte.
+    $end = $fsize - 1 if $end eq "" || $end >= $fsize;
+    my $length = $end - $start + 1;
+    return unless $length > 0;
+    return ($start, $length);
+}
+
  =head2 delete

      $mogc->delete($key);


More information about the mogilefs mailing list