diff -urP trunk_r1127/server/lib/MogileFS/Config.pm trunk_r1127_patched/server/lib/MogileFS/Config.pm
--- trunk_r1127/server/lib/MogileFS/Config.pm	2007-05-17 10:42:53.000000000 +0100
+++ trunk_r1127_patched/server/lib/MogileFS/Config.pm	2007-10-03 19:04:46.000000000 +0100
@@ -300,12 +300,22 @@
             die "Doesn't match acceptable format.";
         };
     };
-
+    # validator for "zone_*" settings: dies unless the value parses as a
+    # netmask, otherwise returns its canonical "base/bits" form.
+    my $valid_netmask = sub {
+        require Net::Netmask; # not otherwise loaded by this file, as far as this patch shows
+        my $n = Net::Netmask->new2($_[0])
+            or die "Doesn't match an acceptable netmask";
+        return $n->desc;
+    };
+
     # let slave settings go through unmodified, for now.
     if ($key =~ /^slave_/) { return $del_if_blank };
     if ($key eq "enable_rebalance") { return $bool };
     if ($key eq "memcache_servers") { return $any };
 
+    # ReplicationPolicy::MultipleNetworks
+    if ($key eq 'network_zones') { return $any };
+    if ($key =~ /^zone_/) { return $valid_netmask };
+
     if ($key eq "rebalance_policy") { return sub {
         my $v = shift;
         return undef unless $v;
diff -urP trunk_r1127/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm trunk_r1127_patched/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm
--- trunk_r1127/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm	1970-01-01 01:00:00.000000000 +0100
+++ trunk_r1127_patched/server/lib/MogileFS/ReplicationPolicy/MultipleNetworks.pm	2007-10-03 19:22:06.000000000 +0100
@@ -0,0 +1,289 @@
+package MogileFS::ReplicationPolicy::MultipleNetworks;
+
+use strict;
+use base 'MogileFS::ReplicationPolicy';
+use Net::Netmask;
+use MogileFS::Util qw(weighted_list);
+use MogileFS::ReplicationRequest qw(ALL_GOOD TOO_GOOD TEMP_NO_ANSWER);
+
+# value stored in %skip_host for a host that holds no copy itself but sits
+# on a network which already has one
+sub AVOIDNETWORK { return "AVOIDNETWORK"; }
+
+sub new {
+    my ($class, $mindevcount) = @_;
+    return bless {
+        mindevcount => $mindevcount,
+    }, $class;
+}
+
+sub new_from_policy_args {
+    my ($class, $argref) = @_;
+    # Note: "MultipleNetworks()" is okay, in which case the 'mindevcount'
+    # on the class is used.  (see below)
+    $$argref =~ s/^\s* \( \s* (\d*) \s* \) \s*//x
+        or die "$class failed to parse args: $$argref";
+    return $class->new($1);
+}
+
+sub mindevcount { $_[0]{mindevcount} }
+
+# Work out where (if anywhere) another copy of a fid should go.  Returns
+# ALL_GOOD, TOO_GOOD, TEMP_NO_ANSWER, or a MogileFS::ReplicationRequest
+# of "ideal" and "desperate" destination devices.
+sub replicate_to {
+    my ($self, %args) = @_;
+
+    my $fid      = delete $args{fid};      # fid scalar to copy
+    my $on_devs  = delete $args{on_devs};  # arrayref of device objects
+    my $all_devs = delete $args{all_devs}; # hashref of { devid => MogileFS::Device }
+    my $failed   = delete $args{failed};   # hashref of { devid => 1 } of failed attempts this round
+
+    # old-style
+    my $min      = delete $args{min};
+    $min         = $self->{mindevcount} || $min;
+
+    warn "Unknown parameters: " . join(", ", sort keys %args) if %args;
+    die "Missing parameters" unless $on_devs && $all_devs && $failed && $fid;
+
+    # number of devices we currently live on
+    my $already_on = @$on_devs;
+
+    # a silly special case, bail out early.
+    return ALL_GOOD if $min == 1 && $already_on;
+
+    # total disks available which are candidates for having files on them
+    my $total_disks = scalar grep { $_->dstate->should_have_files } values %$all_devs;
+
+    # if we have two copies and that's all the disks there are
+    # anywhere, be happy enough
+    return ALL_GOOD if $already_on >= 2 && $already_on == $total_disks;
+
+    # see which and how many unique hosts/networks we're already on.
+    my %on_dev;
+    my %on_host;
+    my %on_network;
+    foreach my $dev (@$on_devs) {
+        $on_host{$dev->hostid} = 1;
+        $on_dev{$dev->id} = 1;
+
+        my $on_ip = $dev->host->ip;
+        if ($on_ip) {
+            my $network = network_for_ip($on_ip);
+            $on_network{$network->desc} = $network;
+        }
+    }
+
+    my $uniq_hosts_on    = scalar keys %on_host;
+    my $uniq_networks_on = scalar keys %on_network || 1;
+
+    my ($total_uniq_hosts, $total_uniq_networks) = unique_hosts_and_networks($all_devs);
+
+    # target as many networks as we can, but not more than min
+    my $target_networks = ($min < $total_uniq_networks) ? $min : $total_uniq_networks;
+
+    # we're never good if our copies aren't on as many networks as possible
+    if ($target_networks <= $uniq_networks_on) {
+        return TOO_GOOD if $uniq_hosts_on >  $min;
+        return TOO_GOOD if $uniq_hosts_on == $min && $already_on > $min;
+
+        return ALL_GOOD if $uniq_hosts_on == $min;
+        return ALL_GOOD if $uniq_hosts_on >= $total_uniq_hosts && $already_on >= $min;
+    }
+
+    # if there are more hosts we're not on yet, we want to exclude devices we're already
+    # on from our applicable host search.
+    # also exclude hosts on networks we're already on
+    my @skip_network = values %on_network;
+    my %skip_host; # hostid => 1 (has a copy) or AVOIDNETWORK
+    if ($uniq_hosts_on < $total_uniq_hosts) {
+        %skip_host = %on_host;
+
+        if (@skip_network) {
+            # work out hosts from the devs passed to us
+            my %seen_host;
+            foreach my $device (values %$all_devs) {
+                my $host = $device->host;
+                next if $seen_host{$host->id}++;
+                next if $skip_host{$host->id}; # already holds a copy
+
+                foreach my $disliked_network (@skip_network) {
+                    next unless $disliked_network->match($host->ip);
+                    $skip_host{$host->id} = AVOIDNETWORK;
+                    last;
+                }
+            }
+        }
+    }
+
+    my @all_dests = weighted_list map {
+        [$_, 100 * $_->percent_free]
+    } grep {
+        ! $on_dev{$_->devid} &&
+        ! $failed->{$_->devid} &&
+        $_->should_get_replicated_files
+    } MogileFS::Device->devices;
+
+    return TEMP_NO_ANSWER unless @all_dests;
+
+    # wrong network is less desperate than wrong host.  compare the marker
+    # only once we know it is set, so unmarked hosts never hit 'eq' as undef
+    my (@ideal, @network_desp, @host_desp);
+    foreach my $dest (@all_dests) {
+        my $why = $skip_host{$dest->hostid};
+        if    (! $why)               { push @ideal,        $dest }
+        elsif ($why eq AVOIDNETWORK) { push @network_desp, $dest }
+        else                         { push @host_desp,    $dest }
+    }
+
+    my @desp = (@network_desp, @host_desp);
+
+    return MogileFS::ReplicationRequest->new(
+        ideal     => \@ideal,
+        desperate => \@desp,
+    );
+}
+
+# can't just scalar keys %cache to count networks
+# might include networks for which we have no hosts yet
+sub unique_hosts_and_networks {
+    my ($devs) = @_;
+
+    my %host;
+    my %netmask;
+    foreach my $dev (values %$devs) {
+        next unless $dev->dstate->should_get_repl_files;
+
+        $host{$dev->hostid}++;
+        $netmask{ network_for_ip($dev->host->ip)->desc }++;
+    }
+
+    return (scalar keys %host, scalar keys %netmask || 1);
+}
+
+
+{
+    my %cache; # '192.168.0.0/24' => Net::Netmask->new2('192.168.0.0/24');
+    my $age;   # increments everytime we look; false until the cache is built
+
+    # turn a server ip into a network
+    # defaults to /16 ranges
+    # this can be overridden with a "zone_$location" setting per network "zone" and
+    # a lookup field listing all "zones"
+    # e.g.
+    #   mogadm settings set network_zones location1,location2
+    #   mogadm settings set zone_location1 192.168.0.0/24
+    #   mogadm settings set zone_location2 10.0.0.0/24
+    # zone names and netmasks must be unique
+    # always returns a Net::Netmask object, never undef
+    sub network_for_ip {
+        my ($ip) = @_;
+
+        if (not $ip) { # can happen in testing
+            return Net::Netmask->new('default');
+        }
+
+        # build the cache on first use, and clear it occasionally
+        if (!$age or $age++ > 500) {
+            clear_and_build_cache();
+            $age = 1;
+        }
+
+        # zones may overlap; take the most specific match so the answer
+        # doesn't depend on hash ordering
+        my $network;
+        foreach my $candidate (values %cache) {
+            next unless $candidate && $candidate->match($ip);
+            $network = $candidate
+                if !$network || $candidate->bits > $network->bits;
+        }
+        return $network if $network;
+
+        # no zone covers this ip, default to its /16
+        if ($ip =~ m/^(\d+\.\d+)\./) {
+            $network = Net::Netmask->new2("$1/16");
+        }
+
+        # callers use ->desc and ->match on the result without checking it
+        return $network || Net::Netmask->new('default');
+    }
+
+    sub clear_and_build_cache {
+        undef %cache;
+
+        # unset until an admin configures zones
+        my $zone_list = MogileFS::Config->server_setting("network_zones");
+        return unless defined $zone_list;
+
+        foreach my $zone (split(",", $zone_list)) {
+            $zone =~ s/^\s+//;
+            $zone =~ s/\s+$//;
+            next unless length $zone;
+
+            my $netmask = MogileFS::Config->server_setting("zone_".$zone);
+
+            if (not $netmask) {
+                warn "couldn't find network_zone <$zone> check your server settings";
+                next;
+            }
+
+            if ($cache{$netmask}) {
+                warn "duplicate netmask <$netmask> in network zones. check your server settings";
+            }
+
+            # new2 returns undef on a bad netmask; don't cache that, as
+            # network_for_ip calls ->match on every cached entry
+            my $network = Net::Netmask->new2($netmask);
+            if (not $network) {
+                warn "couldn't parse <$netmask> as a netmask. error was <".Net::Netmask::errstr().
+                    ">. check your server settings";
+                next;
+            }
+
+            $cache{$netmask} = $network;
+        }
+    }
+
+    sub stuff_cache { # for testing, or it'll try the db
+        my ($self, $ip, $netmask) = @_;
+
+        $cache{$ip}   = $netmask;
+        $age = 1;
+    }
+}
+
+1;
+
+# Local Variables:
+# mode: perl
+# c-basic-indent: 4
+# indent-tabs-mode: nil
+# End:
+
+__END__
+
+=head1 NAME
+
+MogileFS::ReplicationPolicy::MultipleNetworks
+
+=head1 RULES
+
+This policy tries to put files onto devices which are on different networks. If that isn't possible then devices on the same network are returned as "desperate" options.
+
+We aim to have as many copies as we can on unique networks. If there are 2 copies on one network and none on another, with a min of 2, we will still over-replicate to the other network. When called from the rebalancer we will therefore rebalance across networks and reduce the correct copy.
+
+By default we class 2 hosts as being on 2 different networks if they are on different /16 networks (255.255.0.0). This can be controlled using server settings, with a list of network "zones", and then a definition of a netmask for each "zone".
+
+    mogadm settings set network_zones location1,location2
+    mogadm settings set zone_location1 192.168.0.0/24
+    mogadm settings set zone_location2 10.0.0.0/24
+
+Zone names and netmasks must each be unique. If zones overlap, the most specific zone containing a host's ip is used.
+
+=head1 SEE ALSO
+
+L<MogileFS::ReplicationPolicy>
+
+L<MogileFS::ReplicationRequest>
+
+L<Net::Netmask>
diff -urP trunk_r1127/server/t/multiple-networks-replpol.t trunk_r1127_patched/server/t/multiple-networks-replpol.t
--- trunk_r1127/server/t/multiple-networks-replpol.t	1970-01-01 01:00:00.000000000 +0100
+++ trunk_r1127_patched/server/t/multiple-networks-replpol.t	2007-10-03 19:14:43.000000000 +0100
@@ -0,0 +1,198 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+use Test::More;
+use FindBin qw($Bin);
+use Net::Netmask;
+
+use MogileFS::Server;
+use MogileFS::Util qw(error_code);
+use MogileFS::ReplicationPolicy::MultipleNetworks;
+require "$Bin/lib/mogtestlib.pl";
+
+plan tests => 25;
+
+# need just the one, so we only have to stuff the cache once
+my $polclass = "MogileFS::ReplicationPolicy::MultipleNetworks";
+my $pol = $polclass->new;
+
+# test that the MultipleHosts stuff still works
+# we cope when there are no ips
+
+# already good.
+is(rr("min=2 h1[d1=X d2=_] h2[d3=X d4=_]"),
+   "all_good", "all good");
+
+# need to get it onto host2...
+is(rr("min=2 h1[d1=X d2=_] h2[d3=_ d4=_]"),
+   "ideal(3,4)", "need host2");
+
+# still needs to be on host2, even though 2 copies on host1
+is(rr("min=2 h1[d1=X d2=X] h2[d3=_ d4=_]"),
+   "ideal(3,4)", "need host2, even though 2 on host1");
+
+# anywhere will do. (can happen on, say, rebalance)
+is(rr("min=2 h1[d1=_ d2=_] h2[d3=_ d4=_]"),
+   "ideal(1,2,3,4)", "anywhere");
+
+# should desperately try d2, since host2 is down
+is(rr("min=2 h1[d1=X d2=_] h2=down[d3=_ d4=_]"),
+   "desperate(2)");
+
+# should try host3, since host2 is down
+is(rr("min=2 h1[d1=X d2=_] h2=down[d3=_ d4=_] h3[d5=_ d6=_]"),
+   "ideal(5,6)");
+
+# need a copy on a non-dead disk on host1
+is(rr("min=2 h1[d1=_ d2=X,dead] h2=alive[d3=X d4=_]"),
+   "ideal(1)");
+
+# this is an ideal move, since we only have 2 unique hosts:
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_]"),
+   "ideal(1,4)");
+
+# ... but if we have a 3rd host, it's gotta be there
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_] h3[d5=_]"),
+   "ideal(5)");
+
+# ... unless that host is down, in which case it's back to 1/4,
+# but desperately
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=_] h3=down[d5=_]"),
+   "desperate(1,4)");
+
+# too good, uniq hosts > min
+is(rr("min=2 h1[d1=X d2=_] h2[d3=X d4=_] h3[d5=X]"),
+   "too_good");
+
+# too good, but but with uniq hosts == min
+is(rr("min=2 h1[d1=X d2=X] h2[d3=X d4=_]"),
+   "too_good");
+
+# be happy with 3 copies, even though two are on same host (that's our max unique hosts)
+is(rr("min=3 h1[d1=_ d2=X] h2[d3=X d4=X]"),
+   "all_good");
+
+##
+##
+# actual network policy tests
+my ($ad1, $ad2) = ("#192.168.0.2#" ,"#192.168.0.3#" );
+my ($ad3, $ad4) = ("#10.0.0.2#" ,"#10.0.0.3#" );
+my ($ad5, $ad6) = ("#146.101.246.2#","#146.101.142.130#");
+
+# stuff the cache with the default, otherwise it'll go to the db
+$pol->stuff_cache('192.168.0.2' , Net::Netmask->new('192.168.0.0/16'));
+$pol->stuff_cache('192.168.0.3' , Net::Netmask->new('192.168.0.0/16'));
+$pol->stuff_cache('10.0.0.2' , Net::Netmask->new('10.0.0.0/16'));
+$pol->stuff_cache('10.0.0.3' , Net::Netmask->new('10.0.0.0/16'));
+$pol->stuff_cache('146.101.246.2' , Net::Netmask->new('146.101.0.0/16'));
+$pol->stuff_cache('146.101.142.130', Net::Netmask->new('146.101.0.0/16'));
+
+# retest some multiple Host logic all on the same network
+# already good. (there's only one network)
+is(rr("min=2 h1[d1=X d2=_]$ad1 h2[d3=X d4=_]$ad2"),
+   "all_good", "all good");
+
+# need to get it onto host2...
+is(rr("min=2 h1[d1=X d2=_]$ad1 h2[d3=_ d4=_]$ad2"),
+   "desperate(2,3,4)", "need host2");
+
+# still needs to be on host2, even though 2 copies on host1
+is(rr("min=2 h1[d1=X d2=X]$ad1 h2[d3=_ d4=_]$ad2"),
+   "desperate(3,4)", "need host2, even though 2 on host1");
+
+# target another network
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad4"),
+   "ideal(5,6,7,8)","target other network"); # no device 3 or 4 (or 1) in the ideal
+
+# other network down
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3=down[d5=_ d6=_]$ad3 h4=down[d7=_ d8=_]$ad4"),
+   "desperate(1,3,4)", "desperate this network");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad5"),
+   "ideal(5,6,7,8)","include both other networks with three networks");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3=down[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad5"),
+   "ideal(7,8)","one of three networks down");
+
+is(rr("min=2 h1[d1=_ d2=X,dead]$ad1 h2=alive[d3=_ d4=_]$ad2 h3=alive[d5=X d6=_]$ad3"),
+   "ideal(1,3,4)","dead copies don't exclude a network");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=_ d4=_]$ad2 h3[d5=X d6=_]$ad3"),
+   "all_good","enough copies on different networks");
+
+is(rr("min=2 h1[d1=_ d2=X]$ad1 h2[d3=X d4=X]$ad2"),
+   "too_good","3 copies on 2 networks with a min of 2 is too good");
+
+# too many copies on one network, not enough on another, want to over-replicate
+is(rr("min=2 h1[d1=X d2=X]$ad1 h2[d3=X d4=X]$ad2 h3[d5=_ d6=_]$ad3 h4[d7=_ d8=_]$ad4"),
+   "ideal(5,6,7,8)", "more than min hosts, but all on one network");
+
+# mess with netmasks
+$pol->stuff_cache('146.101.246.2' , Net::Netmask->new('146.101.246.0/24'));
+$pol->stuff_cache('146.101.142.130', Net::Netmask->new('146.101.142.0/24'));
+
+is(rr("min=2 h1[d1=_ d2=X]$ad6 h2[d3=_ d4=_]$ad5 h3[d5=_ d6=_]$ad4 h4[d7=_ d8=_]$ad3"),
+   "ideal(3,4,5,6,7,8)","target other network"); # ad5 and ad6 are no longer the same network
+
+sub rr {
+    my ($state) = @_;
+    my $ostate = $state; # original
+
+    MogileFS::Host->t_wipe_singletons;
+    MogileFS::Device->t_wipe_singletons;
+    MogileFS::Config->set_config_no_broadcast("min_free_space", 100);
+
+    my $min = 2;
+    if ($state =~ s/^\bmin=(\d+)\b//) {
+        $min = $1;
+    }
+
+    my $hosts = {};
+    my $devs = {};
+    my $on_devs = [];
+
+    my $parse_error = sub {
+        die "Can't parse:\n $ostate\n"
+    };
+    while ($state =~ s/\bh(\d+)(?:=(.+?))?\[(.+?)\](#\d+\.\d+\.\d+\.\d+\.?#)?//) {
+        my ($n, $opts, $devstr, $ip) = ($1, $2, $3, $4);
+        $opts ||= "";
+        die "dup host $n" if $hosts->{$n};
+
+#        print "1 2 3 4 : <<$1>> <<$2>> <<$3>> <<$4>>\n";
+#        print "$state\n";
+
+        my $h = $hosts->{$n} = MogileFS::Host->of_hostid($n);
+        $h->t_init($opts || "alive");
+        if ($ip) {
+            $ip =~ s/#//g;
+            # $h->set_ip($ip); # can't do, is persistent
+            $h->{hostip} = $ip;
+        }
+
+        foreach my $ddecl (split(/\s+/, $devstr)) {
+            $ddecl =~ /^d(\d+)=([_X])(?:,(\w+))?$/
+                or $parse_error->();
+            my ($dn, $on_not, $status) = ($1, $2, $3);
+            die "dup device $dn" if $devs->{$dn};
+            my $d = $devs->{$dn} = MogileFS::Device->of_devid($dn);
+            $status ||= "alive";
+            $d->t_init($h->id, $status);
+            if ($on_not eq "X" && $d->dstate->should_have_files) {
+                push @$on_devs, $d;
+            }
+        }
+    }
+    $parse_error->() if $state =~ /\S/;
+
+    my $rr = $pol->replicate_to(
+                                fid => 1,
+                                on_devs => $on_devs,
+                                all_devs => $devs,
+                                failed => {},
+                                min => $min,
+                                );
+    return $rr->t_as_string;
+}
+