diff options
| author | Paul Buetow <paul@buetow.org> | 2025-02-05 23:08:31 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-02-05 23:08:31 +0200 |
| commit | ae34fb4dc262c3fb94eac3c381f64e73089cc8be (patch) | |
| tree | 7c6aedb02fe8c893f6a427682e19e633c033877c | |
| parent | f1d8ca050a0d89486574fac62d60fc6ceddc991d (diff) | |
use default perltidy
| -rw-r--r-- | foostats.pl | 971 |
1 file changed, 509 insertions, 462 deletions
diff --git a/foostats.pl b/foostats.pl index ddb6559..88585b7 100644 --- a/foostats.pl +++ b/foostats.pl @@ -6,14 +6,14 @@ use v5.38; # use strict; # use warnings; -use builtin qw(true false); +use builtin qw(true false); use experimental qw(builtin); use feature qw(refaliasing); no warnings qw(experimental::refaliasing); # TODO: UNDO -use diagnostics; +use diagnostics; # TODO: Blog post about this script and the new Perl features used. # TODO NEXT: @@ -23,557 +23,604 @@ use diagnostics; # * Print out all UAs, to add new excludes/blocked IPs package FileHelper { - use JSON; - - sub write ($path, $content) { - open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; - print $fh $content; - close $fh; - - rename "$path.tmp", $path; - } - - sub write_json_gz ($path, $data) { - my $json = encode_json $data; - - say "Writing $path"; - open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; - print $fd $json; - close $fd; - - rename "$path.tmp", $path or die "$path.tmp: $!"; - } - - sub read_json_gz ($path) { - say "Reading $path"; - open my $fd, '<:gzip', $path or die "$path: $!"; - my $json = decode_json <$fd>; - close $fd; - return $json; - } + use JSON; + + sub write ( $path, $content ) { + open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; + print $fh $content; + close $fh; + + rename "$path.tmp", $path; + } + + sub write_json_gz ( $path, $data ) { + my $json = encode_json $data; + + say "Writing $path"; + open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; + print $fd $json; + close $fd; + + rename "$path.tmp", $path or die "$path.tmp: $!"; + } + + sub read_json_gz ($path) { + say "Reading $path"; + open my $fd, '<:gzip', $path or die "$path: $!"; + my $json = decode_json <$fd>; + close $fd; + return $json; + } } package DateHelper { - use Time::Piece; + use Time::Piece; - sub last_month_dates () { - my $today = localtime; - my @dates; + sub last_month_dates () { + my $today = localtime; + my @dates; - for my $days_ago (0..30) { - my 
$date = $today - ($days_ago * 24 * 60 * 60); - push @dates, $date->strftime('%Y%m%d'); - } + for my $days_ago ( 0 .. 30 ) { + my $date = $today - ( $days_ago * 24 * 60 * 60 ); + push @dates, $date->strftime('%Y%m%d'); + } - return @dates; - } + return @dates; + } } package Foostats::Logreader { - use Digest::SHA3 'sha3_512_base64'; - use File::stat; - use PerlIO::gzip; - use Time::Piece; - use String::Util qw(contains startswith endswith); + use Digest::SHA3 'sha3_512_base64'; + use File::stat; + use PerlIO::gzip; + use Time::Piece; + use String::Util qw(contains startswith endswith); + + use constant { + GEMINI_LOGS_GLOB => '/var/log/daemon*', + WEB_LOGS_GLOB => '/var/www/logs/access.log*', + }; + + sub anonymize_ip ($ip) { + my $ip_proto = contains( $ip, ':' ) ? 'IPv6' : 'IPv4'; + my $ip_hash = sha3_512_base64 $ip; + return ( $ip_hash, $ip_proto ); + } + + sub read_lines ( $glob, $cb ) { + my sub year ($path) { + localtime( ( stat $path )->mtime )->strftime('%Y'); + } + + my sub open_file ($path) { + my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; + open my $fd, $flag, $path or die "$path: $!"; + return $fd; + } - use constant { - GEMINI_LOGS_GLOB => '/var/log/daemon*', - WEB_LOGS_GLOB => '/var/www/logs/access.log*', - }; + my $last = false; - sub anonymize_ip ($ip) { - my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4'; - my $ip_hash = sha3_512_base64 $ip; - return ($ip_hash, $ip_proto); - } + say 'File path glob matches: ' . join( ' ', glob $glob ); - sub read_lines ($glob, $cb) { - my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') } - - my sub open_file ($path) { - my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; - open my $fd, $flag, $path or die "$path: $!"; - return $fd; - } - - my $last = false; - - say 'File path glob matches: ' . 
join(' ', glob $glob); - - LAST: - for my $path ( sort { -M $a <=> -M $b } glob $glob) { - say "Processing $path"; - - my $file = open_file $path; - my $year = year $file; - - while (<$file>) { - next if contains($_, 'logfile turned over'); - - # last == true means: After this file, don't process more - $last = true unless defined $cb->($year, split / +/); - } - - say "Closing $path (last:$last)"; - close $file; - last LAST if $last; - } - } - - sub parse_web_logs ($last_processed_date, $cb) { - my sub parse_date ($date) { - my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S'); - return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S')); - } - - my sub parse_web_line (@line) { - my ($date, $time) = parse_date $line[4]; - return undef if $date < $last_processed_date; - - # X-Forwarded-For? - my $ip = $line[-2] eq '-' ? $line[1] : $line[-2]; - my ($ip_hash, $ip_proto) = anonymize_ip $ip; - - return { - proto => 'web', - host => $line[0], - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $time, - uri_path => $line[7], - status => $line[9], - }; - } - - read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) }; - } - - sub parse_gemini_logs ($last_processed_date, $cb) { - my sub parse_date ($year, @line) { - my $timestr = "$line[0] $line[1]"; - return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d"); - } - - my sub parse_vger_line ($year, @line) { - my $full_path = $line[5]; - $full_path =~ s/"//g; - my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4; - $uri_path = '' unless defined $uri_path; - - return { - proto => 'gemini', - host => $host, - uri_path => "/$uri_path", - status => $line[6], - date => int(parse_date($year, @line)), - time => $line[2], - }; - } - - my sub parse_relayd_line ($year, @line) { - my $date = int(parse_date($year, @line)); + LAST: + for my $path ( sort { -M $a <=> -M $b } glob $glob ) { + say "Processing $path"; - my ($ip_hash, $ip_proto) = anonymize_ip $line[12]; - 
return { - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $line[2], - }; + my $file = open_file $path; + my $year = year $file; + + while (<$file>) { + next if contains( $_, 'logfile turned over' ); + + # last == true means: After this file, don't process more + $last = true unless defined $cb->( $year, split / +/ ); + } + + say "Closing $path (last:$last)"; + close $file; + last LAST if $last; + } } - # Expect one vger and one relayd log line per event! So collect - # both events (one from one log line each) and then merge the result hash! - my ($vger, $relayd); - read_lines GEMINI_LOGS_GLOB, sub ($year, @line) { - if ($line[4] eq 'vger:') { - $vger = parse_vger_line $year, @line; - } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) { - $relayd = parse_relayd_line $year, @line; - return undef if $relayd->{date} < $last_processed_date; - } + sub parse_web_logs ( $last_processed_date, $cb ) { + my sub parse_date ($date) { + my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' ); + return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') ); + } + + my sub parse_web_line (@line) { + my ( $date, $time ) = parse_date $line[4]; + return undef if $date < $last_processed_date; + + # X-Forwarded-For? + my $ip = $line[-2] eq '-' ? 
$line[1] : $line[-2]; + my ( $ip_hash, $ip_proto ) = anonymize_ip $ip; + + return { + proto => 'web', + host => $line[0], + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $time, + uri_path => $line[7], + status => $line[9], + }; + } + + read_lines WEB_LOGS_GLOB, + sub ( $year, @line ) { $cb->( parse_web_line @line ) }; + } - if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) { - $cb->({ %$vger, %$relayd }); - $vger = $relayd = undef; - } - - true; - }; - } + sub parse_gemini_logs ( $last_processed_date, $cb ) { + my sub parse_date ( $year, @line ) { + my $timestr = "$line[0] $line[1]"; + return Time::Piece->strptime( $timestr, '%b %d' ) + ->strftime("$year%m%d"); + } + + my sub parse_vger_line ( $year, @line ) { + my $full_path = $line[5]; + $full_path =~ s/"//g; + my ( $proto, undef, $host, $uri_path ) = split '/', $full_path, 4; + $uri_path = '' unless defined $uri_path; + + return { + proto => 'gemini', + host => $host, + uri_path => "/$uri_path", + status => $line[6], + date => int( parse_date( $year, @line ) ), + time => $line[2], + }; + } + + my sub parse_relayd_line ( $year, @line ) { + my $date = int( parse_date( $year, @line ) ); + + my ( $ip_hash, $ip_proto ) = anonymize_ip $line[12]; + return { + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $line[2], + }; + } + + # Expect one vger and one relayd log line per event! So collect + # both events (one from one log line each) and then merge the result hash! 
+ my ( $vger, $relayd ); + read_lines GEMINI_LOGS_GLOB, sub ( $year, @line ) { + if ( $line[4] eq 'vger:' ) { + $vger = parse_vger_line $year, @line; + } + elsif ( $line[5] eq 'relay' and startswith( $line[6], 'gemini' ) ) { + $relayd = parse_relayd_line $year, @line; + return undef if $relayd->{date} < $last_processed_date; + } + + if ( defined $vger + and defined $relayd + and $vger->{time} eq $relayd->{time} ) + { + $cb->( { %$vger, %$relayd } ); + $vger = $relayd = undef; + } + + true; + }; + } - sub parse_logs ($last_web_date, $last_gemini_date) { - my $agg = Foostats::Aggregator->new; + sub parse_logs ( $last_web_date, $last_gemini_date ) { + my $agg = Foostats::Aggregator->new; - say "Last web date: $last_web_date"; - say "Last gemini date: $last_gemini_date"; + say "Last web date: $last_web_date"; + say "Last gemini date: $last_gemini_date"; - parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; - parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; + parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; + parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; - return $agg->{stats}; - } + return $agg->{stats}; + } } package Foostats::Filter { - use String::Util qw(contains startswith endswith); - use constant WARN_ODD => false; - - sub new ($class) { - bless { - odds => [qw( - .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin - .git microsoft.exchange .lua /owa/ - )] - }, $class; - } - - sub ok ($self, $event) { - state %blocked = (); - return false if exists $blocked{$event->{ip_hash}}; + use String::Util qw(contains startswith endswith); + use constant WARN_ODD => false; + + sub new ($class) { + bless { + odds => [ + qw( + .php wordpress /wp .asp .. 
robots.txt .env + % HNAP1 /admin + .git microsoft.exchange .lua /owa/ + ) + ] + }, $class; + } - if ($self->odd($event) or $self->excessive($event)) { - ($blocked{$event->{ip_hash}} //= 0)++; - return false; - } else { - return true; + sub ok ( $self, $event ) { + state %blocked = (); + return false if exists $blocked{ $event->{ip_hash} }; + + if ( $self->odd($event) or $self->excessive($event) ) { + ( $blocked{ $event->{ip_hash} } //= 0 )++; + return false; + } + else { + return true; + } } - } - sub odd ($self, $event) { - \my $uri_path = \$event->{uri_path}; + sub odd ( $self, $event ) { + \my $uri_path = \$event->{uri_path}; - for ($self->{odds}->@*) { - if (contains($uri_path, $_)) { - say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD; - return true; - } + for ( $self->{odds}->@* ) { + if ( contains( $uri_path, $_ ) ) { + say STDERR +"Warn: $uri_path contains $_ and is odd and will therefore be blocked!" + if WARN_ODD; + return true; + } + } + + return false; } - return false; - } + sub excessive ( $self, $event ) { + \my $time = \$event->{time}; + \my $ip_hash = \$event->{ip_hash}; - sub excessive ($self, $event) { - \my $time = \$event->{time}; - \my $ip_hash = \$event->{ip_hash}; + state $last_time = $time; # Time with second: 'HH:MM:SS' + state %count = (); # IPs accessing within the same second! - state $last_time = $time; # Time with second: 'HH:MM:SS' - state %count = (); # IPs accessing within the same second! + if ( $last_time ne $time ) { + $last_time = $time; + %count = (); + return false; + } - if ($last_time ne $time) { - $last_time = $time; - %count = (); - return false; - } + # IP requested site more than once within the same second!? + if ( 1 < ++( $count{$ip_hash} //= 0 ) ) { + say STDERR "Warn: $ip_hash blocked due to excessive requesting..." + if WARN_ODD; + return true; + } - # IP requested site more than once within the same second!? 
- if (1 < ++($count{$ip_hash} //= 0)) { - say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD; - return true; + return false; } - - return false; - } } package Foostats::Aggregator { - use String::Util qw(contains startswith endswith); - - use constant { - ATOM_FEED_URI => '/gemfeed/atom.xml', - GEMFEED_URI => '/gemfeed/index.gmi', - GEMFEED_URI_2 => '/gemfeed/', - }; - - sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class } - - sub add ($self, $event) { - return undef unless defined $event; - - my $date = $event->{date}; - my $date_key = $event->{proto} . "_$date"; - - $self->{stats}{$date_key} //= { - count => { filtered => 0 }, - feed_ips => { atom_feed => {}, gemfeed => {} }, - page_ips => { hosts => {}, urls => {} }, + use String::Util qw(contains startswith endswith); + + use constant { + ATOM_FEED_URI => '/gemfeed/atom.xml', + GEMFEED_URI => '/gemfeed/index.gmi', + GEMFEED_URI_2 => '/gemfeed/', }; - \my $s = \$self->{stats}{$date_key}; - unless ($self->{filter}->ok($event)) { - $s->{count}{filtered}++; - return $event; + sub new ($class) { + bless { filter => Foostats::Filter->new, stats => {} }, $class; } - $self->add_count($s, $event); - $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event); + sub add ( $self, $event ) { + return undef unless defined $event; + + my $date = $event->{date}; + my $date_key = $event->{proto} . 
"_$date"; + + $self->{stats}{$date_key} //= { + count => { filtered => 0 }, + feed_ips => { atom_feed => {}, gemfeed => {} }, + page_ips => { hosts => {}, urls => {} }, + }; - return $event; - } + \my $s = \$self->{stats}{$date_key}; + unless ( $self->{filter}->ok($event) ) { + $s->{count}{filtered}++; + return $event; + } - sub add_count ($self, $stats, $event) { - \my $c = \$stats->{count}; - \my $e = \$event; + $self->add_count( $s, $event ); + $self->add_page_ips( $s, $event ) + unless $self->add_feed_ips( $s, $event ); - ($c->{$e->{proto}} //= 0)++; - ($c->{$e->{ip_proto}} //= 0)++; - } + return $event; + } + + sub add_count ( $self, $stats, $event ) { + \my $c = \$stats->{count}; + \my $e = \$event; - sub add_feed_ips ($self, $stats, $event) { - \my $f = \$stats->{feed_ips}; - \my $e = \$event; + ( $c->{ $e->{proto} } //= 0 )++; + ( $c->{ $e->{ip_proto} } //= 0 )++; + } - if (endswith($e->{uri_path}, ATOM_FEED_URI)) { - ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++; - } elsif (contains($e->{uri_path}, GEMFEED_URI)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } else { - 0 + sub add_feed_ips ( $self, $stats, $event ) { + \my $f = \$stats->{feed_ips}; + \my $e = \$event; + + if ( endswith( $e->{uri_path}, ATOM_FEED_URI ) ) { + ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( contains( $e->{uri_path}, GEMFEED_URI ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( endswith( $e->{uri_path}, GEMFEED_URI_2 ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + else { + 0; + } } - } - sub add_page_ips ($self, $stats, $event) { - \my $e = \$event; - \my $p = \$stats->{page_ips}; + sub add_page_ips ( $self, $stats, $event ) { + \my $e = \$event; + \my $p = \$stats->{page_ips}; - return if !endswith($e->{uri_path}, '.html') - && !endswith($e->{uri_path}, '.gmi'); + return + if !endswith( $e->{uri_path}, '.html' ) + && !endswith( 
$e->{uri_path}, '.gmi' ); - ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++; - ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++; - } + ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++; + ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //= + 0 )++; + } } package Foostats::FileOutputter { - use JSON; - use Sys::Hostname; - use PerlIO::gzip; - - sub new ($class, %args) { - my $self = bless \%args, $class; - mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir}; - - return $self; - } - - sub last_processed_date ($self, $proto) { - my $hostname = hostname(); - my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; - my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0; - - return int($date); - } - - sub write ($self) { - $self->for_dates(sub ($self, $date_key, $stats) { - my $hostname = hostname(); - my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; - FileHelper::write_json_gz $path, $stats; - }); - } - - sub for_dates ($self, $cb) { - $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*; - } -} + use JSON; + use Sys::Hostname; + use PerlIO::gzip; -package Foostats::Replicator { - use JSON; - use File::Basename; - use LWP::UserAgent; - use String::Util qw(endswith); - - sub replicate ($stats_dir, $partner_node) { - say "Replicating from $partner_node"; - - for my $proto (qw(gemini web)) { - my $count = 0; - - for my $date (DateHelper::last_month_dates) { - my $file_base = "${proto}_${date}"; - my $dest_path = "${file_base}.$partner_node.json.gz"; - - replicate_file( - "https://$partner_node/foostats/$dest_path", - "$stats_dir/$dest_path", - $count++ < 3, # Always replicate the newest 3 files. - ); - } + sub new ( $class, %args ) { + my $self = bless \%args, $class; + mkdir $self->{stats_dir} + or die $self->{stats_dir} . ": $!" 
+ unless -d $self->{stats_dir}; + + return $self; } - } - sub replicate_file ($remote_url, $dest_path, $force) { - # $dest_path already exists, not replicating it - return if !$force && -f $dest_path; + sub last_processed_date ( $self, $proto ) { + my $hostname = hostname(); + my @processed = + glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; + my ($date) = + @processed ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ ) : 0; - say "Replicating $remote_url to $dest_path (force:$force)... "; - my $response = LWP::UserAgent->new->get($remote_url); - unless ($response->is_success) { - say "\nFailed to fetch the file: " . $response->status_line; - return; + return int($date); } - FileHelper::write $dest_path, $response->decoded_content; - say 'done'; - } -} + sub write ($self) { + $self->for_dates( + sub ( $self, $date_key, $stats ) { + my $hostname = hostname(); + my $path = + $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; + FileHelper::write_json_gz $path, $stats; + } + ); + } -package Foostats::Merger { - use Data::Dumper; # TODO: UNDO - - sub merge ($stats_dir) { - my %merge; - $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates; - return %merge; - } - - sub merge_for_date ($stats_dir, $date) { - printf "Merging for date %s\n", $date; - - my @stats = stats_for_date($stats_dir, $date); - return { - feed_ips => feed_ips(@stats), - count => count(@stats), - page_ips => page_ips(@stats), - }; - } + sub for_dates ( $self, $cb ) { + $cb->( $self, $_, $self->{stats}{$_} ) for sort keys $self->{stats}->%*; + } +} - sub merge_ips ($a, $b, $key_transform = undef) { - my sub merge ($a, $b) { - while (my ($key, $val) = each %$b) { - $a->{$key} //= 0; - $a->{$key} += $val; - } +package Foostats::Replicator { + use JSON; + use File::Basename; + use LWP::UserAgent; + use String::Util qw(endswith); + + sub replicate ( $stats_dir, $partner_node ) { + say "Replicating from $partner_node"; + + for my $proto (qw(gemini web)) { + my 
$count = 0; + + for my $date (DateHelper::last_month_dates) { + my $file_base = "${proto}_${date}"; + my $dest_path = "${file_base}.$partner_node.json.gz"; + + replicate_file( + "https://$partner_node/foostats/$dest_path", + "$stats_dir/$dest_path", + $count++ < 3, # Always replicate the newest 3 files. + ); + } + } } - my $is_num = qr/^\d+(\.\d+)?$/; + sub replicate_file ( $remote_url, $dest_path, $force ) { - while (my ($key, $val) = each %$b) { - $key = $key_transform->($key) if defined $key_transform; - - if (not exists $a->{$key}) { - $a->{$key} = $val; - } elsif (ref($a->{$key}) eq 'HASH' && ref($val) eq 'HASH') { - merge($a->{$key}, $val); - } elsif ($a->{$key} =~ $is_num && $val =~ $is_num) { - $a->{$key} += $val; - } else { - die "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", - $key, ref($key), $a->{$key}, ref($a->{$key}), $val, ref($val); - } + # $dest_path already exists, not replicating it + return if !$force && -f $dest_path; + + say "Replicating $remote_url to $dest_path (force:$force)... "; + my $response = LWP::UserAgent->new->get($remote_url); + unless ( $response->is_success ) { + say "\nFailed to fetch the file: " . $response->status_line; + return; + } + + FileHelper::write $dest_path, $response->decoded_content; + say 'done'; } - } +} - sub feed_ips (@stats) { - my (%gemini, %web); +package Foostats::Merger { + use Data::Dumper; # TODO: UNDO - for my $stats (@stats) { - my $merge = $stats->{proto} eq 'web' ? 
\%web : \%gemini; - printf "Merging proto %s feed IPs\n", $stats->{proto}; - merge_ips($merge, $stats->{feed_ips}); + sub merge ($stats_dir) { + my %merge; + $merge{$_} = merge_for_date( $stats_dir, $_ ) + for DateHelper::last_month_dates; + return %merge; } - my %total; - merge_ips(\%total, $web{$_}) for keys %web; - merge_ips(\%total, $gemini{$_}) for keys %gemini; + sub merge_for_date ( $stats_dir, $date ) { + printf "Merging for date %s\n", $date; - my %merge = ( - 'Total' => scalar keys %total, - 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, - 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, - 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, - 'Web Atom' => scalar keys $web{atom_feed}->%*, - ); + my @stats = stats_for_date( $stats_dir, $date ); + return { + feed_ips => feed_ips(@stats), + count => count(@stats), + page_ips => page_ips(@stats), + }; + } - return \%merge; - } + sub merge_ips ( $a, $b, $key_transform = undef ) { + my sub merge ( $a, $b ) { + while ( my ( $key, $val ) = each %$b ) { + $a->{$key} //= 0; + $a->{$key} += $val; + } + } + + my $is_num = qr/^\d+(\.\d+)?$/; + + while ( my ( $key, $val ) = each %$b ) { + $key = $key_transform->($key) if defined $key_transform; + + if ( not exists $a->{$key} ) { + $a->{$key} = $val; + } + elsif ( ref( $a->{$key} ) eq 'HASH' && ref($val) eq 'HASH' ) { + merge( $a->{$key}, $val ); + } + elsif ( $a->{$key} =~ $is_num && $val =~ $is_num ) { + $a->{$key} += $val; + } + else { + die +"Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", + $key, ref($key), $a->{$key}, ref( $a->{$key} ), $val, + ref($val); + } + } + } - sub count (@stats) { - my %merge; + sub feed_ips (@stats) { + my ( %gemini, %web ); + + for my $stats (@stats) { + my $merge = $stats->{proto} eq 'web' ? 
\%web : \%gemini; + printf "Merging proto %s feed IPs\n", $stats->{proto}; + merge_ips( $merge, $stats->{feed_ips} ); + } + + my %total; + merge_ips( \%total, $web{$_} ) for keys %web; + merge_ips( \%total, $gemini{$_} ) for keys %gemini; + + my %merge = ( + 'Total' => scalar keys %total, + 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, + 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, + 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, + 'Web Atom' => scalar keys $web{atom_feed}->%*, + ); - for my $stats (@stats) { - while (my ($key, $val) = each $stats->{count}->%*) { - $merge{$key} //= 0; - $merge{$key} += $val; - } + return \%merge; } - return \%merge; - } + sub count (@stats) { + my %merge; - sub page_ips (@stats) { - my %merge = (urls => {}, hosts => {}); + for my $stats (@stats) { + while ( my ( $key, $val ) = each $stats->{count}->%* ) { + $merge{$key} //= 0; + $merge{$key} += $val; + } + } - for my $key (keys %merge) { - merge_ips($merge{$key}, $_->{page_ips}->{$key}, sub ($key) { - $key =~ s/\.html$/.../; - $key =~ s/\.gmi$/.../; - $key; - }) for @stats; + return \%merge; + } - # Keep only uniq IP count - $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*; - } + sub page_ips (@stats) { + my %merge = ( urls => {}, hosts => {} ); + + for my $key ( keys %merge ) { + merge_ips( + $merge{$key}, + $_->{page_ips}->{$key}, + sub ($key) { + $key =~ s/\.html$/.../; + $key =~ s/\.gmi$/.../; + $key; + } + ) for @stats; + + # Keep only uniq IP count + $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* + for keys $merge{$key}->%*; + } + + return \%merge; + } - return \%merge; - } + sub stats_for_date ( $stats_dir, $date ) { + my @stats; - sub stats_for_date ($stats_dir, $date) { - my @stats; + for my $proto (qw(gemini web)) { + for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { + printf "Reading %s\n", $path; + push @stats, FileHelper::read_json_gz($path); + @{ $stats[-1] }{qw(proto path)} = ( $proto, $path ); + } + } 
- for my $proto (qw(gemini web)) { - for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { - printf "Reading %s\n", $path; - push @stats, FileHelper::read_json_gz($path); - @{$stats[-1]}{qw(proto path)} = ($proto, $path); - } + return @stats; } - - return @stats; - } } package Foostats::Reporter { - use Data::Dumper; + use Data::Dumper; - sub report (%merged) { - print Dumper %merged; - } + sub report (%merged) { + print Dumper %merged; + } } package main { - use Getopt::Long; - use Sys::Hostname; - - sub parse_logs ($stats_dir) { - my $out = Foostats::FileOutputter->new(stats_dir => $stats_dir); + use Getopt::Long; + use Sys::Hostname; - $out->{stats} = Foostats::Logreader::parse_logs( - $out->last_processed_date('web'), - $out->last_processed_date('gemini'), - ); + sub parse_logs ($stats_dir) { + my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir ); - $out->write; - } - - my ($parse_logs, $replicate, $report, $all); - - # With default values - my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; - my $partner_node = hostname eq 'fishfinger.buetow.org' ? 'blowfish.buetow.org' : 'fishfinger.buetow.org'; + $out->{stats} = Foostats::Logreader::parse_logs( + $out->last_processed_date('web'), + $out->last_processed_date('gemini'), + ); - # TODO: Add help output - GetOptions 'parse-logs!' => \$parse_logs, - 'replicate!' => \$replicate, - 'report!' => \$report, - 'all!' => \$all, - 'stats-dir=s' => \$stats_dir, - 'partner-node=s' => \$partner_node; + $out->write; + } - parse_logs $stats_dir if $parse_logs or $all; - Foostats::Replicator::replicate($stats_dir, $partner_node) if $replicate or $all; - Foostats::Reporter::report(Foostats::Merger::merge($stats_dir)) if $report or $all; + my ( $parse_logs, $replicate, $report, $all ); + + # With default values + my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; + my $partner_node = + hostname eq 'fishfinger.buetow.org' + ? 
'blowfish.buetow.org' + : 'fishfinger.buetow.org'; + + # TODO: Add help output + GetOptions + 'parse-logs!' => \$parse_logs, + 'replicate!' => \$replicate, + 'report!' => \$report, + 'all!' => \$all, + 'stats-dir=s' => \$stats_dir, + 'partner-node=s' => \$partner_node; + + parse_logs $stats_dir if $parse_logs or $all; + Foostats::Replicator::replicate( $stats_dir, $partner_node ) + if $replicate or $all; + Foostats::Reporter::report( Foostats::Merger::merge($stats_dir) ) + if $report or $all; } |
