diff options
| author | Paul Buetow <paul@buetow.org> | 2025-02-05 23:08:31 +0200 |
|---|---|---|
| committer | Paul Buetow <paul@buetow.org> | 2025-02-05 23:08:31 +0200 |
| commit | ae34fb4dc262c3fb94eac3c381f64e73089cc8be (patch) | |
| tree | 7c6aedb02fe8c893f6a427682e19e633c033877c | |
| parent | f1d8ca050a0d89486574fac62d60fc6ceddc991d (diff) | |
use default perltidy
| -rw-r--r-- | foostats.pl | 971 |
1 file changed, 509 insertions, 462 deletions
diff --git a/foostats.pl b/foostats.pl index ddb6559..88585b7 100644 --- a/foostats.pl +++ b/foostats.pl @@ -6,14 +6,14 @@ use v5.38; # use strict; # use warnings; -use builtin qw(true false); +use builtin qw(true false); use experimental qw(builtin); use feature qw(refaliasing); no warnings qw(experimental::refaliasing); # TODO: UNDO -use diagnostics; +use diagnostics; # TODO: Blog post about this script and the new Perl features used. # TODO NEXT: @@ -23,557 +23,604 @@ use diagnostics; # * Print out all UAs, to add new excludes/blocked IPs package FileHelper { - use JSON; - - sub write ($path, $content) { - open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; - print $fh $content; - close $fh; - - rename "$path.tmp", $path; - } - - sub write_json_gz ($path, $data) { - my $json = encode_json $data; - - say "Writing $path"; - open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; - print $fd $json; - close $fd; - - rename "$path.tmp", $path or die "$path.tmp: $!"; - } - - sub read_json_gz ($path) { - say "Reading $path"; - open my $fd, '<:gzip', $path or die "$path: $!"; - my $json = decode_json <$fd>; - close $fd; - return $json; - } + use JSON; + + sub write ( $path, $content ) { + open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!"; + print $fh $content; + close $fh; + + rename "$path.tmp", $path; + } + + sub write_json_gz ( $path, $data ) { + my $json = encode_json $data; + + say "Writing $path"; + open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!"; + print $fd $json; + close $fd; + + rename "$path.tmp", $path or die "$path.tmp: $!"; + } + + sub read_json_gz ($path) { + say "Reading $path"; + open my $fd, '<:gzip', $path or die "$path: $!"; + my $json = decode_json <$fd>; + close $fd; + return $json; + } } package DateHelper { - use Time::Piece; + use Time::Piece; - sub last_month_dates () { - my $today = localtime; - my @dates; + sub last_month_dates () { + my $today = localtime; + my @dates; - for my $days_ago (0..30) { - my 
$date = $today - ($days_ago * 24 * 60 * 60); - push @dates, $date->strftime('%Y%m%d'); - } + for my $days_ago ( 0 .. 30 ) { + my $date = $today - ( $days_ago * 24 * 60 * 60 ); + push @dates, $date->strftime('%Y%m%d'); + } - return @dates; - } + return @dates; + } } package Foostats::Logreader { - use Digest::SHA3 'sha3_512_base64'; - use File::stat; - use PerlIO::gzip; - use Time::Piece; - use String::Util qw(contains startswith endswith); + use Digest::SHA3 'sha3_512_base64'; + use File::stat; + use PerlIO::gzip; + use Time::Piece; + use String::Util qw(contains startswith endswith); + + use constant { + GEMINI_LOGS_GLOB => '/var/log/daemon*', + WEB_LOGS_GLOB => '/var/www/logs/access.log*', + }; + + sub anonymize_ip ($ip) { + my $ip_proto = contains( $ip, ':' ) ? 'IPv6' : 'IPv4'; + my $ip_hash = sha3_512_base64 $ip; + return ( $ip_hash, $ip_proto ); + } + + sub read_lines ( $glob, $cb ) { + my sub year ($path) { + localtime( ( stat $path )->mtime )->strftime('%Y'); + } + + my sub open_file ($path) { + my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; + open my $fd, $flag, $path or die "$path: $!"; + return $fd; + } - use constant { - GEMINI_LOGS_GLOB => '/var/log/daemon*', - WEB_LOGS_GLOB => '/var/www/logs/access.log*', - }; + my $last = false; - sub anonymize_ip ($ip) { - my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4'; - my $ip_hash = sha3_512_base64 $ip; - return ($ip_hash, $ip_proto); - } + say 'File path glob matches: ' . join( ' ', glob $glob ); - sub read_lines ($glob, $cb) { - my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') } - - my sub open_file ($path) { - my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<'; - open my $fd, $flag, $path or die "$path: $!"; - return $fd; - } - - my $last = false; - - say 'File path glob matches: ' . 
join(' ', glob $glob); - - LAST: - for my $path ( sort { -M $a <=> -M $b } glob $glob) { - say "Processing $path"; - - my $file = open_file $path; - my $year = year $file; - - while (<$file>) { - next if contains($_, 'logfile turned over'); - - # last == true means: After this file, don't process more - $last = true unless defined $cb->($year, split / +/); - } - - say "Closing $path (last:$last)"; - close $file; - last LAST if $last; - } - } - - sub parse_web_logs ($last_processed_date, $cb) { - my sub parse_date ($date) { - my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S'); - return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S')); - } - - my sub parse_web_line (@line) { - my ($date, $time) = parse_date $line[4]; - return undef if $date < $last_processed_date; - - # X-Forwarded-For? - my $ip = $line[-2] eq '-' ? $line[1] : $line[-2]; - my ($ip_hash, $ip_proto) = anonymize_ip $ip; - - return { - proto => 'web', - host => $line[0], - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $time, - uri_path => $line[7], - status => $line[9], - }; - } - - read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) }; - } - - sub parse_gemini_logs ($last_processed_date, $cb) { - my sub parse_date ($year, @line) { - my $timestr = "$line[0] $line[1]"; - return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d"); - } - - my sub parse_vger_line ($year, @line) { - my $full_path = $line[5]; - $full_path =~ s/"//g; - my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4; - $uri_path = '' unless defined $uri_path; - - return { - proto => 'gemini', - host => $host, - uri_path => "/$uri_path", - status => $line[6], - date => int(parse_date($year, @line)), - time => $line[2], - }; - } - - my sub parse_relayd_line ($year, @line) { - my $date = int(parse_date($year, @line)); + LAST: + for my $path ( sort { -M $a <=> -M $b } glob $glob ) { + say "Processing $path"; - my ($ip_hash, $ip_proto) = anonymize_ip $line[12]; - 
return { - ip_hash => $ip_hash, - ip_proto => $ip_proto, - date => $date, - time => $line[2], - }; + my $file = open_file $path; + my $year = year $file; + + while (<$file>) { + next if contains( $_, 'logfile turned over' ); + + # last == true means: After this file, don't process more + $last = true unless defined $cb->( $year, split / +/ ); + } + + say "Closing $path (last:$last)"; + close $file; + last LAST if $last; + } } - # Expect one vger and one relayd log line per event! So collect - # both events (one from one log line each) and then merge the result hash! - my ($vger, $relayd); - read_lines GEMINI_LOGS_GLOB, sub ($year, @line) { - if ($line[4] eq 'vger:') { - $vger = parse_vger_line $year, @line; - } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) { - $relayd = parse_relayd_line $year, @line; - return undef if $relayd->{date} < $last_processed_date; - } + sub parse_web_logs ( $last_processed_date, $cb ) { + my sub parse_date ($date) { + my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' ); + return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') ); + } + + my sub parse_web_line (@line) { + my ( $date, $time ) = parse_date $line[4]; + return undef if $date < $last_processed_date; + + # X-Forwarded-For? + my $ip = $line[-2] eq '-' ? 
$line[1] : $line[-2]; + my ( $ip_hash, $ip_proto ) = anonymize_ip $ip; + + return { + proto => 'web', + host => $line[0], + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $time, + uri_path => $line[7], + status => $line[9], + }; + } + + read_lines WEB_LOGS_GLOB, + sub ( $year, @line ) { $cb->( parse_web_line @line ) }; + } - if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) { - $cb->({ %$vger, %$relayd }); - $vger = $relayd = undef; - } - - true; - }; - } + sub parse_gemini_logs ( $last_processed_date, $cb ) { + my sub parse_date ( $year, @line ) { + my $timestr = "$line[0] $line[1]"; + return Time::Piece->strptime( $timestr, '%b %d' ) + ->strftime("$year%m%d"); + } + + my sub parse_vger_line ( $year, @line ) { + my $full_path = $line[5]; + $full_path =~ s/"//g; + my ( $proto, undef, $host, $uri_path ) = split '/', $full_path, 4; + $uri_path = '' unless defined $uri_path; + + return { + proto => 'gemini', + host => $host, + uri_path => "/$uri_path", + status => $line[6], + date => int( parse_date( $year, @line ) ), + time => $line[2], + }; + } + + my sub parse_relayd_line ( $year, @line ) { + my $date = int( parse_date( $year, @line ) ); + + my ( $ip_hash, $ip_proto ) = anonymize_ip $line[12]; + return { + ip_hash => $ip_hash, + ip_proto => $ip_proto, + date => $date, + time => $line[2], + }; + } + + # Expect one vger and one relayd log line per event! So collect + # both events (one from one log line each) and then merge the result hash! 
+ my ( $vger, $relayd ); + read_lines GEMINI_LOGS_GLOB, sub ( $year, @line ) { + if ( $line[4] eq 'vger:' ) { + $vger = parse_vger_line $year, @line; + } + elsif ( $line[5] eq 'relay' and startswith( $line[6], 'gemini' ) ) { + $relayd = parse_relayd_line $year, @line; + return undef if $relayd->{date} < $last_processed_date; + } + + if ( defined $vger + and defined $relayd + and $vger->{time} eq $relayd->{time} ) + { + $cb->( { %$vger, %$relayd } ); + $vger = $relayd = undef; + } + + true; + }; + } - sub parse_logs ($last_web_date, $last_gemini_date) { - my $agg = Foostats::Aggregator->new; + sub parse_logs ( $last_web_date, $last_gemini_date ) { + my $agg = Foostats::Aggregator->new; - say "Last web date: $last_web_date"; - say "Last gemini date: $last_gemini_date"; + say "Last web date: $last_web_date"; + say "Last gemini date: $last_gemini_date"; - parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; - parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; + parse_web_logs $last_web_date, sub ($event) { $agg->add($event) }; + parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) }; - return $agg->{stats}; - } + return $agg->{stats}; + } } package Foostats::Filter { - use String::Util qw(contains startswith endswith); - use constant WARN_ODD => false; - - sub new ($class) { - bless { - odds => [qw( - .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin - .git microsoft.exchange .lua /owa/ - )] - }, $class; - } - - sub ok ($self, $event) { - state %blocked = (); - return false if exists $blocked{$event->{ip_hash}}; + use String::Util qw(contains startswith endswith); + use constant WARN_ODD => false; + + sub new ($class) { + bless { + odds => [ + qw( + .php wordpress /wp .asp .. 
robots.txt .env + % HNAP1 /admin + .git microsoft.exchange .lua /owa/ + ) + ] + }, $class; + } - if ($self->odd($event) or $self->excessive($event)) { - ($blocked{$event->{ip_hash}} //= 0)++; - return false; - } else { - return true; + sub ok ( $self, $event ) { + state %blocked = (); + return false if exists $blocked{ $event->{ip_hash} }; + + if ( $self->odd($event) or $self->excessive($event) ) { + ( $blocked{ $event->{ip_hash} } //= 0 )++; + return false; + } + else { + return true; + } } - } - sub odd ($self, $event) { - \my $uri_path = \$event->{uri_path}; + sub odd ( $self, $event ) { + \my $uri_path = \$event->{uri_path}; - for ($self->{odds}->@*) { - if (contains($uri_path, $_)) { - say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD; - return true; - } + for ( $self->{odds}->@* ) { + if ( contains( $uri_path, $_ ) ) { + say STDERR +"Warn: $uri_path contains $_ and is odd and will therefore be blocked!" + if WARN_ODD; + return true; + } + } + + return false; } - return false; - } + sub excessive ( $self, $event ) { + \my $time = \$event->{time}; + \my $ip_hash = \$event->{ip_hash}; - sub excessive ($self, $event) { - \my $time = \$event->{time}; - \my $ip_hash = \$event->{ip_hash}; + state $last_time = $time; # Time with second: 'HH:MM:SS' + state %count = (); # IPs accessing within the same second! - state $last_time = $time; # Time with second: 'HH:MM:SS' - state %count = (); # IPs accessing within the same second! + if ( $last_time ne $time ) { + $last_time = $time; + %count = (); + return false; + } - if ($last_time ne $time) { - $last_time = $time; - %count = (); - return false; - } + # IP requested site more than once within the same second!? + if ( 1 < ++( $count{$ip_hash} //= 0 ) ) { + say STDERR "Warn: $ip_hash blocked due to excessive requesting..." + if WARN_ODD; + return true; + } - # IP requested site more than once within the same second!? 
- if (1 < ++($count{$ip_hash} //= 0)) { - say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD; - return true; + return false; } - - return false; - } } package Foostats::Aggregator { - use String::Util qw(contains startswith endswith); - - use constant { - ATOM_FEED_URI => '/gemfeed/atom.xml', - GEMFEED_URI => '/gemfeed/index.gmi', - GEMFEED_URI_2 => '/gemfeed/', - }; - - sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class } - - sub add ($self, $event) { - return undef unless defined $event; - - my $date = $event->{date}; - my $date_key = $event->{proto} . "_$date"; - - $self->{stats}{$date_key} //= { - count => { filtered => 0 }, - feed_ips => { atom_feed => {}, gemfeed => {} }, - page_ips => { hosts => {}, urls => {} }, + use String::Util qw(contains startswith endswith); + + use constant { + ATOM_FEED_URI => '/gemfeed/atom.xml', + GEMFEED_URI => '/gemfeed/index.gmi', + GEMFEED_URI_2 => '/gemfeed/', }; - \my $s = \$self->{stats}{$date_key}; - unless ($self->{filter}->ok($event)) { - $s->{count}{filtered}++; - return $event; + sub new ($class) { + bless { filter => Foostats::Filter->new, stats => {} }, $class; } - $self->add_count($s, $event); - $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event); + sub add ( $self, $event ) { + return undef unless defined $event; + + my $date = $event->{date}; + my $date_key = $event->{proto} . 
"_$date"; + + $self->{stats}{$date_key} //= { + count => { filtered => 0 }, + feed_ips => { atom_feed => {}, gemfeed => {} }, + page_ips => { hosts => {}, urls => {} }, + }; - return $event; - } + \my $s = \$self->{stats}{$date_key}; + unless ( $self->{filter}->ok($event) ) { + $s->{count}{filtered}++; + return $event; + } - sub add_count ($self, $stats, $event) { - \my $c = \$stats->{count}; - \my $e = \$event; + $self->add_count( $s, $event ); + $self->add_page_ips( $s, $event ) + unless $self->add_feed_ips( $s, $event ); - ($c->{$e->{proto}} //= 0)++; - ($c->{$e->{ip_proto}} //= 0)++; - } + return $event; + } + + sub add_count ( $self, $stats, $event ) { + \my $c = \$stats->{count}; + \my $e = \$event; - sub add_feed_ips ($self, $stats, $event) { - \my $f = \$stats->{feed_ips}; - \my $e = \$event; + ( $c->{ $e->{proto} } //= 0 )++; + ( $c->{ $e->{ip_proto} } //= 0 )++; + } - if (endswith($e->{uri_path}, ATOM_FEED_URI)) { - ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++; - } elsif (contains($e->{uri_path}, GEMFEED_URI)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) { - ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++; - } else { - 0 + sub add_feed_ips ( $self, $stats, $event ) { + \my $f = \$stats->{feed_ips}; + \my $e = \$event; + + if ( endswith( $e->{uri_path}, ATOM_FEED_URI ) ) { + ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( contains( $e->{uri_path}, GEMFEED_URI ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + elsif ( endswith( $e->{uri_path}, GEMFEED_URI_2 ) ) { + ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++; + } + else { + 0; + } } - } - sub add_page_ips ($self, $stats, $event) { - \my $e = \$event; - \my $p = \$stats->{page_ips}; + sub add_page_ips ( $self, $stats, $event ) { + \my $e = \$event; + \my $p = \$stats->{page_ips}; - return if !endswith($e->{uri_path}, '.html') - && !endswith($e->{uri_path}, '.gmi'); + return + if !endswith( $e->{uri_path}, '.html' ) + && !endswith( 
$e->{uri_path}, '.gmi' ); - ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++; - ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++; - } + ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++; + ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //= + 0 )++; + } } package Foostats::FileOutputter { - use JSON; - use Sys::Hostname; - use PerlIO::gzip; - - sub new ($class, %args) { - my $self = bless \%args, $class; - mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir}; - - return $self; - } - - sub last_processed_date ($self, $proto) { - my $hostname = hostname(); - my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; - my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0; - - return int($date); - } - - sub write ($self) { - $self->for_dates(sub ($self, $date_key, $stats) { - my $hostname = hostname(); - my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; - FileHelper::write_json_gz $path, $stats; - }); - } - - sub for_dates ($self, $cb) { - $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*; - } -} + use JSON; + use Sys::Hostname; + use PerlIO::gzip; -package Foostats::Replicator { - use JSON; - use File::Basename; - use LWP::UserAgent; - use String::Util qw(endswith); - - sub replicate ($stats_dir, $partner_node) { - say "Replicating from $partner_node"; - - for my $proto (qw(gemini web)) { - my $count = 0; - - for my $date (DateHelper::last_month_dates) { - my $file_base = "${proto}_${date}"; - my $dest_path = "${file_base}.$partner_node.json.gz"; - - replicate_file( - "https://$partner_node/foostats/$dest_path", - "$stats_dir/$dest_path", - $count++ < 3, # Always replicate the newest 3 files. - ); - } + sub new ( $class, %args ) { + my $self = bless \%args, $class; + mkdir $self->{stats_dir} + or die $self->{stats_dir} . ": $!" 
+ unless -d $self->{stats_dir}; + + return $self; } - } - sub replicate_file ($remote_url, $dest_path, $force) { - # $dest_path already exists, not replicating it - return if !$force && -f $dest_path; + sub last_processed_date ( $self, $proto ) { + my $hostname = hostname(); + my @processed = + glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz"; + my ($date) = + @processed ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ ) : 0; - say "Replicating $remote_url to $dest_path (force:$force)... "; - my $response = LWP::UserAgent->new->get($remote_url); - unless ($response->is_success) { - say "\nFailed to fetch the file: " . $response->status_line; - return; + return int($date); } - FileHelper::write $dest_path, $response->decoded_content; - say 'done'; - } -} + sub write ($self) { + $self->for_dates( + sub ( $self, $date_key, $stats ) { + my $hostname = hostname(); + my $path = + $self->{stats_dir} . "/${date_key}.$hostname.json.gz"; + FileHelper::write_json_gz $path, $stats; + } + ); + } -package Foostats::Merger { - use Data::Dumper; # TODO: UNDO - - sub merge ($stats_dir) { - my %merge; - $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates; - return %merge; - } - - sub merge_for_date ($stats_dir, $date) { - printf "Merging for date %s\n", $date; - - my @stats = stats_for_date($stats_dir, $date); - return { - feed_ips => feed_ips(@stats), - count => count(@stats), - page_ips => page_ips(@stats), - }; - } + sub for_dates ( $self, $cb ) { + $cb->( $self, $_, $self->{stats}{$_} ) for sort keys $self->{stats}->%*; + } +} - sub merge_ips ($a, $b, $key_transform = undef) { - my sub merge ($a, $b) { - while (my ($key, $val) = each %$b) { - $a->{$key} //= 0; - $a->{$key} += $val; - } +package Foostats::Replicator { + use JSON; + use File::Basename; + use LWP::UserAgent; + use String::Util qw(endswith); + + sub replicate ( $stats_dir, $partner_node ) { + say "Replicating from $partner_node"; + + for my $proto (qw(gemini web)) { + my 
$count = 0; + + for my $date (DateHelper::last_month_dates) { + my $file_base = "${proto}_${date}"; + my $dest_path = "${file_base}.$partner_node.json.gz"; + + replicate_file( + "https://$partner_node/foostats/$dest_path", + "$stats_dir/$dest_path", + $count++ < 3, # Always replicate the newest 3 files. + ); + } + } } - my $is_num = qr/^\d+(\.\d+)?$/; + sub replicate_file ( $remote_url, $dest_path, $force ) { - while (my ($key, $val) = each %$b) { - $key = $key_transform->($key) if defined $key_transform; - - if (not exists $a->{$key}) { - $a->{$key} = $val; - } elsif (ref($a->{$key}) eq 'HASH' && ref($val) eq 'HASH') { - merge($a->{$key}, $val); - } elsif ($a->{$key} =~ $is_num && $val =~ $is_num) { - $a->{$key} += $val; - } else { - die "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", - $key, ref($key), $a->{$key}, ref($a->{$key}), $val, ref($val); - } + # $dest_path already exists, not replicating it + return if !$force && -f $dest_path; + + say "Replicating $remote_url to $dest_path (force:$force)... "; + my $response = LWP::UserAgent->new->get($remote_url); + unless ( $response->is_success ) { + say "\nFailed to fetch the file: " . $response->status_line; + return; + } + + FileHelper::write $dest_path, $response->decoded_content; + say 'done'; } - } +} - sub feed_ips (@stats) { - my (%gemini, %web); +package Foostats::Merger { + use Data::Dumper; # TODO: UNDO - for my $stats (@stats) { - my $merge = $stats->{proto} eq 'web' ? 
\%web : \%gemini; - printf "Merging proto %s feed IPs\n", $stats->{proto}; - merge_ips($merge, $stats->{feed_ips}); + sub merge ($stats_dir) { + my %merge; + $merge{$_} = merge_for_date( $stats_dir, $_ ) + for DateHelper::last_month_dates; + return %merge; } - my %total; - merge_ips(\%total, $web{$_}) for keys %web; - merge_ips(\%total, $gemini{$_}) for keys %gemini; + sub merge_for_date ( $stats_dir, $date ) { + printf "Merging for date %s\n", $date; - my %merge = ( - 'Total' => scalar keys %total, - 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, - 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, - 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, - 'Web Atom' => scalar keys $web{atom_feed}->%*, - ); + my @stats = stats_for_date( $stats_dir, $date ); + return { + feed_ips => feed_ips(@stats), + count => count(@stats), + page_ips => page_ips(@stats), + }; + } - return \%merge; - } + sub merge_ips ( $a, $b, $key_transform = undef ) { + my sub merge ( $a, $b ) { + while ( my ( $key, $val ) = each %$b ) { + $a->{$key} //= 0; + $a->{$key} += $val; + } + } + + my $is_num = qr/^\d+(\.\d+)?$/; + + while ( my ( $key, $val ) = each %$b ) { + $key = $key_transform->($key) if defined $key_transform; + + if ( not exists $a->{$key} ) { + $a->{$key} = $val; + } + elsif ( ref( $a->{$key} ) eq 'HASH' && ref($val) eq 'HASH' ) { + merge( $a->{$key}, $val ); + } + elsif ( $a->{$key} =~ $is_num && $val =~ $is_num ) { + $a->{$key} += $val; + } + else { + die +"Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n", + $key, ref($key), $a->{$key}, ref( $a->{$key} ), $val, + ref($val); + } + } + } - sub count (@stats) { - my %merge; + sub feed_ips (@stats) { + my ( %gemini, %web ); + + for my $stats (@stats) { + my $merge = $stats->{proto} eq 'web' ? 
\%web : \%gemini; + printf "Merging proto %s feed IPs\n", $stats->{proto}; + merge_ips( $merge, $stats->{feed_ips} ); + } + + my %total; + merge_ips( \%total, $web{$_} ) for keys %web; + merge_ips( \%total, $gemini{$_} ) for keys %gemini; + + my %merge = ( + 'Total' => scalar keys %total, + 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*, + 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*, + 'Web Gemfeed' => scalar keys $web{gemfeed}->%*, + 'Web Atom' => scalar keys $web{atom_feed}->%*, + ); - for my $stats (@stats) { - while (my ($key, $val) = each $stats->{count}->%*) { - $merge{$key} //= 0; - $merge{$key} += $val; - } + return \%merge; } - return \%merge; - } + sub count (@stats) { + my %merge; - sub page_ips (@stats) { - my %merge = (urls => {}, hosts => {}); + for my $stats (@stats) { + while ( my ( $key, $val ) = each $stats->{count}->%* ) { + $merge{$key} //= 0; + $merge{$key} += $val; + } + } - for my $key (keys %merge) { - merge_ips($merge{$key}, $_->{page_ips}->{$key}, sub ($key) { - $key =~ s/\.html$/.../; - $key =~ s/\.gmi$/.../; - $key; - }) for @stats; + return \%merge; + } - # Keep only uniq IP count - $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*; - } + sub page_ips (@stats) { + my %merge = ( urls => {}, hosts => {} ); + + for my $key ( keys %merge ) { + merge_ips( + $merge{$key}, + $_->{page_ips}->{$key}, + sub ($key) { + $key =~ s/\.html$/.../; + $key =~ s/\.gmi$/.../; + $key; + } + ) for @stats; + + # Keep only uniq IP count + $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* + for keys $merge{$key}->%*; + } + + return \%merge; + } - return \%merge; - } + sub stats_for_date ( $stats_dir, $date ) { + my @stats; - sub stats_for_date ($stats_dir, $date) { - my @stats; + for my $proto (qw(gemini web)) { + for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { + printf "Reading %s\n", $path; + push @stats, FileHelper::read_json_gz($path); + @{ $stats[-1] }{qw(proto path)} = ( $proto, $path ); + } + } 
- for my $proto (qw(gemini web)) { - for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) { - printf "Reading %s\n", $path; - push @stats, FileHelper::read_json_gz($path); - @{$stats[-1]}{qw(proto path)} = ($proto, $path); - } + return @stats; } - - return @stats; - } } package Foostats::Reporter { - use Data::Dumper; + use Data::Dumper; - sub report (%merged) { - print Dumper %merged; - } + sub report (%merged) { + print Dumper %merged; + } } package main { - use Getopt::Long; - use Sys::Hostname; - - sub parse_logs ($stats_dir) { - my $out = Foostats::FileOutputter->new(stats_dir => $stats_dir); + use Getopt::Long; + use Sys::Hostname; - $out->{stats} = Foostats::Logreader::parse_logs( - $out->last_processed_date('web'), - $out->last_processed_date('gemini'), - ); + sub parse_logs ($stats_dir) { + my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir ); - $out->write; - } - - my ($parse_logs, $replicate, $report, $all); - - # With default values - my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; - my $partner_node = hostname eq 'fishfinger.buetow.org' ? 'blowfish.buetow.org' : 'fishfinger.buetow.org'; + $out->{stats} = Foostats::Logreader::parse_logs( + $out->last_processed_date('web'), + $out->last_processed_date('gemini'), + ); - # TODO: Add help output - GetOptions 'parse-logs!' => \$parse_logs, - 'replicate!' => \$replicate, - 'report!' => \$report, - 'all!' => \$all, - 'stats-dir=s' => \$stats_dir, - 'partner-node=s' => \$partner_node; + $out->write; + } - parse_logs $stats_dir if $parse_logs or $all; - Foostats::Replicator::replicate($stats_dir, $partner_node) if $replicate or $all; - Foostats::Reporter::report(Foostats::Merger::merge($stats_dir)) if $report or $all; + my ( $parse_logs, $replicate, $report, $all ); + + # With default values + my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats'; + my $partner_node = + hostname eq 'fishfinger.buetow.org' + ? 
'blowfish.buetow.org' + : 'fishfinger.buetow.org'; + + # TODO: Add help output + GetOptions + 'parse-logs!' => \$parse_logs, + 'replicate!' => \$replicate, + 'report!' => \$report, + 'all!' => \$all, + 'stats-dir=s' => \$stats_dir, + 'partner-node=s' => \$partner_node; + + parse_logs $stats_dir if $parse_logs or $all; + Foostats::Replicator::replicate( $stats_dir, $partner_node ) + if $replicate or $all; + Foostats::Reporter::report( Foostats::Merger::merge($stats_dir) ) + if $report or $all; } |
