author     Paul Buetow <paul@buetow.org>    2025-02-05 23:08:31 +0200
committer  Paul Buetow <paul@buetow.org>    2025-02-05 23:08:31 +0200
commit     ae34fb4dc262c3fb94eac3c381f64e73089cc8be (patch)
tree       7c6aedb02fe8c893f6a427682e19e633c033877c
parent     f1d8ca050a0d89486574fac62d60fc6ceddc991d (diff)

    use default perltidy

 foostats.pl | 971
 1 file changed, 509 insertions(+), 462 deletions(-)
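The change below appears purely mechanical: foostats.pl was re-run through perltidy with its stock settings. For reference, a minimal sketch of reproducing such a run via the Perl::Tidy module (assumes Perl::Tidy is installed; -npro tells perltidy to ignore any local .perltidyrc so the built-in defaults apply):

    use strict;
    use warnings;
    use Perl::Tidy;

    # Slurp the script, tidy it with default settings, print the result.
    my $source = do { local ( @ARGV, $/ ) = ('foostats.pl'); <> };
    my $tidied;

    my $error = Perl::Tidy::perltidy(
        source      => \$source,
        destination => \$tidied,
        argv        => '-npro',    # ignore any .perltidyrc, use defaults
    );
    die "perltidy reported errors\n" if $error;

    print $tidied;
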
diff --git a/foostats.pl b/foostats.pl
index ddb6559..88585b7 100644
--- a/foostats.pl
+++ b/foostats.pl
@@ -6,14 +6,14 @@ use v5.38;
# use strict;
# use warnings;
-use builtin qw(true false);
+use builtin qw(true false);
use experimental qw(builtin);
use feature qw(refaliasing);
no warnings qw(experimental::refaliasing);
# TODO: UNDO
-use diagnostics;
+use diagnostics;
# TODO: Blog post about this script and the new Perl features used.
# TODO NEXT:
@@ -23,557 +23,604 @@ use diagnostics;
# * Print out all UAs, to add new excludes/blocked IPs
package FileHelper {
- use JSON;
-
- sub write ($path, $content) {
- open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!";
- print $fh $content;
- close $fh;
-
- rename "$path.tmp", $path;
- }
-
- sub write_json_gz ($path, $data) {
- my $json = encode_json $data;
-
- say "Writing $path";
- open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!";
- print $fd $json;
- close $fd;
-
- rename "$path.tmp", $path or die "$path.tmp: $!";
- }
-
- sub read_json_gz ($path) {
- say "Reading $path";
- open my $fd, '<:gzip', $path or die "$path: $!";
- my $json = decode_json <$fd>;
- close $fd;
- return $json;
- }
+ use JSON;
+
+ sub write ( $path, $content ) {
+ open my $fh, '>', "$path.tmp" or die "\nCannot open file: $!";
+ print $fh $content;
+ close $fh;
+
+ rename "$path.tmp", $path;
+ }
+
+ sub write_json_gz ( $path, $data ) {
+ my $json = encode_json $data;
+
+ say "Writing $path";
+ open my $fd, '>:gzip', "$path.tmp" or die "$path.tmp: $!";
+ print $fd $json;
+ close $fd;
+
+ rename "$path.tmp", $path or die "$path.tmp: $!";
+ }
+
+ sub read_json_gz ($path) {
+ say "Reading $path";
+ open my $fd, '<:gzip', $path or die "$path: $!";
+ my $json = decode_json <$fd>;
+ close $fd;
+ return $json;
+ }
}
package DateHelper {
- use Time::Piece;
+ use Time::Piece;
- sub last_month_dates () {
- my $today = localtime;
- my @dates;
+ sub last_month_dates () {
+ my $today = localtime;
+ my @dates;
- for my $days_ago (0..30) {
- my $date = $today - ($days_ago * 24 * 60 * 60);
- push @dates, $date->strftime('%Y%m%d');
- }
+ for my $days_ago ( 0 .. 30 ) {
+ my $date = $today - ( $days_ago * 24 * 60 * 60 );
+ push @dates, $date->strftime('%Y%m%d');
+ }
- return @dates;
- }
+ return @dates;
+ }
}
package Foostats::Logreader {
- use Digest::SHA3 'sha3_512_base64';
- use File::stat;
- use PerlIO::gzip;
- use Time::Piece;
- use String::Util qw(contains startswith endswith);
+ use Digest::SHA3 'sha3_512_base64';
+ use File::stat;
+ use PerlIO::gzip;
+ use Time::Piece;
+ use String::Util qw(contains startswith endswith);
+
+ use constant {
+ GEMINI_LOGS_GLOB => '/var/log/daemon*',
+ WEB_LOGS_GLOB => '/var/www/logs/access.log*',
+ };
+
+ sub anonymize_ip ($ip) {
+ my $ip_proto = contains( $ip, ':' ) ? 'IPv6' : 'IPv4';
+ my $ip_hash = sha3_512_base64 $ip;
+ return ( $ip_hash, $ip_proto );
+ }
+
+ sub read_lines ( $glob, $cb ) {
+ my sub year ($path) {
+ localtime( ( stat $path )->mtime )->strftime('%Y');
+ }
+
+ my sub open_file ($path) {
+ my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<';
+ open my $fd, $flag, $path or die "$path: $!";
+ return $fd;
+ }
- use constant {
- GEMINI_LOGS_GLOB => '/var/log/daemon*',
- WEB_LOGS_GLOB => '/var/www/logs/access.log*',
- };
+ my $last = false;
- sub anonymize_ip ($ip) {
- my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4';
- my $ip_hash = sha3_512_base64 $ip;
- return ($ip_hash, $ip_proto);
- }
+ say 'File path glob matches: ' . join( ' ', glob $glob );
- sub read_lines ($glob, $cb) {
- my sub year ($path) { localtime( (stat $path)->mtime )->strftime('%Y') }
-
- my sub open_file ($path) {
- my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<';
- open my $fd, $flag, $path or die "$path: $!";
- return $fd;
- }
-
- my $last = false;
-
- say 'File path glob matches: ' . join(' ', glob $glob);
-
- LAST:
- for my $path ( sort { -M $a <=> -M $b } glob $glob) {
- say "Processing $path";
-
- my $file = open_file $path;
- my $year = year $file;
-
- while (<$file>) {
- next if contains($_, 'logfile turned over');
-
- # last == true means: After this file, don't process more
- $last = true unless defined $cb->($year, split / +/);
- }
-
- say "Closing $path (last:$last)";
- close $file;
- last LAST if $last;
- }
- }
-
- sub parse_web_logs ($last_processed_date, $cb) {
- my sub parse_date ($date) {
- my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S');
- return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S'));
- }
-
- my sub parse_web_line (@line) {
- my ($date, $time) = parse_date $line[4];
- return undef if $date < $last_processed_date;
-
- # X-Forwarded-For?
- my $ip = $line[-2] eq '-' ? $line[1] : $line[-2];
- my ($ip_hash, $ip_proto) = anonymize_ip $ip;
-
- return {
- proto => 'web',
- host => $line[0],
- ip_hash => $ip_hash,
- ip_proto => $ip_proto,
- date => $date,
- time => $time,
- uri_path => $line[7],
- status => $line[9],
- };
- }
-
- read_lines WEB_LOGS_GLOB, sub ($year, @line) { $cb->(parse_web_line @line) };
- }
-
- sub parse_gemini_logs ($last_processed_date, $cb) {
- my sub parse_date ($year, @line) {
- my $timestr = "$line[0] $line[1]";
- return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d");
- }
-
- my sub parse_vger_line ($year, @line) {
- my $full_path = $line[5];
- $full_path =~ s/"//g;
- my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4;
- $uri_path = '' unless defined $uri_path;
-
- return {
- proto => 'gemini',
- host => $host,
- uri_path => "/$uri_path",
- status => $line[6],
- date => int(parse_date($year, @line)),
- time => $line[2],
- };
- }
-
- my sub parse_relayd_line ($year, @line) {
- my $date = int(parse_date($year, @line));
+ LAST:
+ for my $path ( sort { -M $a <=> -M $b } glob $glob ) {
+ say "Processing $path";
- my ($ip_hash, $ip_proto) = anonymize_ip $line[12];
- return {
- ip_hash => $ip_hash,
- ip_proto => $ip_proto,
- date => $date,
- time => $line[2],
- };
+ my $file = open_file $path;
+ my $year = year $file;
+
+ while (<$file>) {
+ next if contains( $_, 'logfile turned over' );
+
+ # last == true means: After this file, don't process more
+ $last = true unless defined $cb->( $year, split / +/ );
+ }
+
+ say "Closing $path (last:$last)";
+ close $file;
+ last LAST if $last;
+ }
}
- # Expect one vger and one relayd log line per event! So collect
- # both events (one from one log line each) and then merge the result hash!
- my ($vger, $relayd);
- read_lines GEMINI_LOGS_GLOB, sub ($year, @line) {
- if ($line[4] eq 'vger:') {
- $vger = parse_vger_line $year, @line;
- } elsif ($line[5] eq 'relay' and startswith($line[6], 'gemini')) {
- $relayd = parse_relayd_line $year, @line;
- return undef if $relayd->{date} < $last_processed_date;
- }
+ sub parse_web_logs ( $last_processed_date, $cb ) {
+ my sub parse_date ($date) {
+ my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' );
+ return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') );
+ }
+
+ my sub parse_web_line (@line) {
+ my ( $date, $time ) = parse_date $line[4];
+ return undef if $date < $last_processed_date;
+
+ # X-Forwarded-For?
+ my $ip = $line[-2] eq '-' ? $line[1] : $line[-2];
+ my ( $ip_hash, $ip_proto ) = anonymize_ip $ip;
+
+ return {
+ proto => 'web',
+ host => $line[0],
+ ip_hash => $ip_hash,
+ ip_proto => $ip_proto,
+ date => $date,
+ time => $time,
+ uri_path => $line[7],
+ status => $line[9],
+ };
+ }
+
+ read_lines WEB_LOGS_GLOB,
+ sub ( $year, @line ) { $cb->( parse_web_line @line ) };
+ }
- if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) {
- $cb->({ %$vger, %$relayd });
- $vger = $relayd = undef;
- }
-
- true;
- };
- }
+ sub parse_gemini_logs ( $last_processed_date, $cb ) {
+ my sub parse_date ( $year, @line ) {
+ my $timestr = "$line[0] $line[1]";
+ return Time::Piece->strptime( $timestr, '%b %d' )
+ ->strftime("$year%m%d");
+ }
+
+ my sub parse_vger_line ( $year, @line ) {
+ my $full_path = $line[5];
+ $full_path =~ s/"//g;
+ my ( $proto, undef, $host, $uri_path ) = split '/', $full_path, 4;
+ $uri_path = '' unless defined $uri_path;
+
+ return {
+ proto => 'gemini',
+ host => $host,
+ uri_path => "/$uri_path",
+ status => $line[6],
+ date => int( parse_date( $year, @line ) ),
+ time => $line[2],
+ };
+ }
+
+ my sub parse_relayd_line ( $year, @line ) {
+ my $date = int( parse_date( $year, @line ) );
+
+ my ( $ip_hash, $ip_proto ) = anonymize_ip $line[12];
+ return {
+ ip_hash => $ip_hash,
+ ip_proto => $ip_proto,
+ date => $date,
+ time => $line[2],
+ };
+ }
+
+ # Expect one vger and one relayd log line per event! So collect
+ # both events (one from one log line each) and then merge the result hash!
+ my ( $vger, $relayd );
+ read_lines GEMINI_LOGS_GLOB, sub ( $year, @line ) {
+ if ( $line[4] eq 'vger:' ) {
+ $vger = parse_vger_line $year, @line;
+ }
+ elsif ( $line[5] eq 'relay' and startswith( $line[6], 'gemini' ) ) {
+ $relayd = parse_relayd_line $year, @line;
+ return undef if $relayd->{date} < $last_processed_date;
+ }
+
+ if ( defined $vger
+ and defined $relayd
+ and $vger->{time} eq $relayd->{time} )
+ {
+ $cb->( { %$vger, %$relayd } );
+ $vger = $relayd = undef;
+ }
+
+ true;
+ };
+ }
- sub parse_logs ($last_web_date, $last_gemini_date) {
- my $agg = Foostats::Aggregator->new;
+ sub parse_logs ( $last_web_date, $last_gemini_date ) {
+ my $agg = Foostats::Aggregator->new;
- say "Last web date: $last_web_date";
- say "Last gemini date: $last_gemini_date";
+ say "Last web date: $last_web_date";
+ say "Last gemini date: $last_gemini_date";
- parse_web_logs $last_web_date, sub ($event) { $agg->add($event) };
- parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) };
+ parse_web_logs $last_web_date, sub ($event) { $agg->add($event) };
+ parse_gemini_logs $last_gemini_date, sub ($event) { $agg->add($event) };
- return $agg->{stats};
- }
+ return $agg->{stats};
+ }
}
package Foostats::Filter {
- use String::Util qw(contains startswith endswith);
- use constant WARN_ODD => false;
-
- sub new ($class) {
- bless {
- odds => [qw(
- .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin
- .git microsoft.exchange .lua /owa/
- )]
- }, $class;
- }
-
- sub ok ($self, $event) {
- state %blocked = ();
- return false if exists $blocked{$event->{ip_hash}};
+ use String::Util qw(contains startswith endswith);
+ use constant WARN_ODD => false;
+
+ sub new ($class) {
+ bless {
+ odds => [
+ qw(
+ .php wordpress /wp .asp .. robots.txt .env + % HNAP1 /admin
+ .git microsoft.exchange .lua /owa/
+ )
+ ]
+ }, $class;
+ }
- if ($self->odd($event) or $self->excessive($event)) {
- ($blocked{$event->{ip_hash}} //= 0)++;
- return false;
- } else {
- return true;
+ sub ok ( $self, $event ) {
+ state %blocked = ();
+ return false if exists $blocked{ $event->{ip_hash} };
+
+ if ( $self->odd($event) or $self->excessive($event) ) {
+ ( $blocked{ $event->{ip_hash} } //= 0 )++;
+ return false;
+ }
+ else {
+ return true;
+ }
}
- }
- sub odd ($self, $event) {
- \my $uri_path = \$event->{uri_path};
+ sub odd ( $self, $event ) {
+ \my $uri_path = \$event->{uri_path};
- for ($self->{odds}->@*) {
- if (contains($uri_path, $_)) {
- say STDERR "Warn: $uri_path contains $_ and is odd and will therefore be blocked!" if WARN_ODD;
- return true;
- }
+ for ( $self->{odds}->@* ) {
+ if ( contains( $uri_path, $_ ) ) {
+ say STDERR
+"Warn: $uri_path contains $_ and is odd and will therefore be blocked!"
+ if WARN_ODD;
+ return true;
+ }
+ }
+
+ return false;
}
- return false;
- }
+ sub excessive ( $self, $event ) {
+ \my $time = \$event->{time};
+ \my $ip_hash = \$event->{ip_hash};
- sub excessive ($self, $event) {
- \my $time = \$event->{time};
- \my $ip_hash = \$event->{ip_hash};
+ state $last_time = $time; # Time with second: 'HH:MM:SS'
+ state %count = (); # IPs accessing within the same second!
- state $last_time = $time; # Time with second: 'HH:MM:SS'
- state %count = (); # IPs accessing within the same second!
+ if ( $last_time ne $time ) {
+ $last_time = $time;
+ %count = ();
+ return false;
+ }
- if ($last_time ne $time) {
- $last_time = $time;
- %count = ();
- return false;
- }
+ # IP requested site more than once within the same second!?
+ if ( 1 < ++( $count{$ip_hash} //= 0 ) ) {
+ say STDERR "Warn: $ip_hash blocked due to excessive requesting..."
+ if WARN_ODD;
+ return true;
+ }
- # IP requested site more than once within the same second!?
- if (1 < ++($count{$ip_hash} //= 0)) {
- say STDERR "Warn: $ip_hash blocked due to excessive requesting..." if WARN_ODD;
- return true;
+ return false;
}
-
- return false;
- }
}
package Foostats::Aggregator {
- use String::Util qw(contains startswith endswith);
-
- use constant {
- ATOM_FEED_URI => '/gemfeed/atom.xml',
- GEMFEED_URI => '/gemfeed/index.gmi',
- GEMFEED_URI_2 => '/gemfeed/',
- };
-
- sub new ($class) { bless { filter => Foostats::Filter->new, stats => {} }, $class }
-
- sub add ($self, $event) {
- return undef unless defined $event;
-
- my $date = $event->{date};
- my $date_key = $event->{proto} . "_$date";
-
- $self->{stats}{$date_key} //= {
- count => { filtered => 0 },
- feed_ips => { atom_feed => {}, gemfeed => {} },
- page_ips => { hosts => {}, urls => {} },
+ use String::Util qw(contains startswith endswith);
+
+ use constant {
+ ATOM_FEED_URI => '/gemfeed/atom.xml',
+ GEMFEED_URI => '/gemfeed/index.gmi',
+ GEMFEED_URI_2 => '/gemfeed/',
};
- \my $s = \$self->{stats}{$date_key};
- unless ($self->{filter}->ok($event)) {
- $s->{count}{filtered}++;
- return $event;
+ sub new ($class) {
+ bless { filter => Foostats::Filter->new, stats => {} }, $class;
}
- $self->add_count($s, $event);
- $self->add_page_ips($s, $event) unless $self->add_feed_ips($s, $event);
+ sub add ( $self, $event ) {
+ return undef unless defined $event;
+
+ my $date = $event->{date};
+ my $date_key = $event->{proto} . "_$date";
+
+ $self->{stats}{$date_key} //= {
+ count => { filtered => 0 },
+ feed_ips => { atom_feed => {}, gemfeed => {} },
+ page_ips => { hosts => {}, urls => {} },
+ };
- return $event;
- }
+ \my $s = \$self->{stats}{$date_key};
+ unless ( $self->{filter}->ok($event) ) {
+ $s->{count}{filtered}++;
+ return $event;
+ }
- sub add_count ($self, $stats, $event) {
- \my $c = \$stats->{count};
- \my $e = \$event;
+ $self->add_count( $s, $event );
+ $self->add_page_ips( $s, $event )
+ unless $self->add_feed_ips( $s, $event );
- ($c->{$e->{proto}} //= 0)++;
- ($c->{$e->{ip_proto}} //= 0)++;
- }
+ return $event;
+ }
+
+ sub add_count ( $self, $stats, $event ) {
+ \my $c = \$stats->{count};
+ \my $e = \$event;
- sub add_feed_ips ($self, $stats, $event) {
- \my $f = \$stats->{feed_ips};
- \my $e = \$event;
+ ( $c->{ $e->{proto} } //= 0 )++;
+ ( $c->{ $e->{ip_proto} } //= 0 )++;
+ }
- if (endswith($e->{uri_path}, ATOM_FEED_URI)) {
- ($f->{atom_feed}->{$e->{ip_hash}} //= 0)++;
- } elsif (contains($e->{uri_path}, GEMFEED_URI)) {
- ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++;
- } elsif (endswith($e->{uri_path}, GEMFEED_URI_2)) {
- ($f->{gemfeed}->{$e->{ip_hash}} //= 0)++;
- } else {
- 0
+ sub add_feed_ips ( $self, $stats, $event ) {
+ \my $f = \$stats->{feed_ips};
+ \my $e = \$event;
+
+ if ( endswith( $e->{uri_path}, ATOM_FEED_URI ) ) {
+ ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ elsif ( contains( $e->{uri_path}, GEMFEED_URI ) ) {
+ ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ elsif ( endswith( $e->{uri_path}, GEMFEED_URI_2 ) ) {
+ ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++;
+ }
+ else {
+ 0;
+ }
}
- }
- sub add_page_ips ($self, $stats, $event) {
- \my $e = \$event;
- \my $p = \$stats->{page_ips};
+ sub add_page_ips ( $self, $stats, $event ) {
+ \my $e = \$event;
+ \my $p = \$stats->{page_ips};
- return if !endswith($e->{uri_path}, '.html')
- && !endswith($e->{uri_path}, '.gmi');
+ return
+ if !endswith( $e->{uri_path}, '.html' )
+ && !endswith( $e->{uri_path}, '.gmi' );
- ($p->{hosts}->{$e->{host}}->{$e->{ip_hash}} //= 0)++;
- ($p->{urls}->{$e->{host}.$e->{uri_path}}->{$e->{ip_hash}} //= 0)++;
- }
+ ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++;
+ ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //=
+ 0 )++;
+ }
}
package Foostats::FileOutputter {
- use JSON;
- use Sys::Hostname;
- use PerlIO::gzip;
-
- sub new ($class, %args) {
- my $self = bless \%args, $class;
- mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir};
-
- return $self;
- }
-
- sub last_processed_date ($self, $proto) {
- my $hostname = hostname();
- my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
- my ($date) = @processed ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/) : 0;
-
- return int($date);
- }
-
- sub write ($self) {
- $self->for_dates(sub ($self, $date_key, $stats) {
- my $hostname = hostname();
- my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
- FileHelper::write_json_gz $path, $stats;
- });
- }
-
- sub for_dates ($self, $cb) {
- $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*;
- }
-}
+ use JSON;
+ use Sys::Hostname;
+ use PerlIO::gzip;
-package Foostats::Replicator {
- use JSON;
- use File::Basename;
- use LWP::UserAgent;
- use String::Util qw(endswith);
-
- sub replicate ($stats_dir, $partner_node) {
- say "Replicating from $partner_node";
-
- for my $proto (qw(gemini web)) {
- my $count = 0;
-
- for my $date (DateHelper::last_month_dates) {
- my $file_base = "${proto}_${date}";
- my $dest_path = "${file_base}.$partner_node.json.gz";
-
- replicate_file(
- "https://$partner_node/foostats/$dest_path",
- "$stats_dir/$dest_path",
- $count++ < 3, # Always replicate the newest 3 files.
- );
- }
+ sub new ( $class, %args ) {
+ my $self = bless \%args, $class;
+ mkdir $self->{stats_dir}
+ or die $self->{stats_dir} . ": $!"
+ unless -d $self->{stats_dir};
+
+ return $self;
}
- }
- sub replicate_file ($remote_url, $dest_path, $force) {
- # $dest_path already exists, not replicating it
- return if !$force && -f $dest_path;
+ sub last_processed_date ( $self, $proto ) {
+ my $hostname = hostname();
+ my @processed =
+ glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
+ my ($date) =
+ @processed ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ ) : 0;
- say "Replicating $remote_url to $dest_path (force:$force)... ";
- my $response = LWP::UserAgent->new->get($remote_url);
- unless ($response->is_success) {
- say "\nFailed to fetch the file: " . $response->status_line;
- return;
+ return int($date);
}
- FileHelper::write $dest_path, $response->decoded_content;
- say 'done';
- }
-}
+ sub write ($self) {
+ $self->for_dates(
+ sub ( $self, $date_key, $stats ) {
+ my $hostname = hostname();
+ my $path =
+ $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
+ FileHelper::write_json_gz $path, $stats;
+ }
+ );
+ }
-package Foostats::Merger {
- use Data::Dumper; # TODO: UNDO
-
- sub merge ($stats_dir) {
- my %merge;
- $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates;
- return %merge;
- }
-
- sub merge_for_date ($stats_dir, $date) {
- printf "Merging for date %s\n", $date;
-
- my @stats = stats_for_date($stats_dir, $date);
- return {
- feed_ips => feed_ips(@stats),
- count => count(@stats),
- page_ips => page_ips(@stats),
- };
- }
+ sub for_dates ( $self, $cb ) {
+ $cb->( $self, $_, $self->{stats}{$_} ) for sort keys $self->{stats}->%*;
+ }
+}
- sub merge_ips ($a, $b, $key_transform = undef) {
- my sub merge ($a, $b) {
- while (my ($key, $val) = each %$b) {
- $a->{$key} //= 0;
- $a->{$key} += $val;
- }
+package Foostats::Replicator {
+ use JSON;
+ use File::Basename;
+ use LWP::UserAgent;
+ use String::Util qw(endswith);
+
+ sub replicate ( $stats_dir, $partner_node ) {
+ say "Replicating from $partner_node";
+
+ for my $proto (qw(gemini web)) {
+ my $count = 0;
+
+ for my $date (DateHelper::last_month_dates) {
+ my $file_base = "${proto}_${date}";
+ my $dest_path = "${file_base}.$partner_node.json.gz";
+
+ replicate_file(
+ "https://$partner_node/foostats/$dest_path",
+ "$stats_dir/$dest_path",
+ $count++ < 3, # Always replicate the newest 3 files.
+ );
+ }
+ }
}
- my $is_num = qr/^\d+(\.\d+)?$/;
+ sub replicate_file ( $remote_url, $dest_path, $force ) {
- while (my ($key, $val) = each %$b) {
- $key = $key_transform->($key) if defined $key_transform;
-
- if (not exists $a->{$key}) {
- $a->{$key} = $val;
- } elsif (ref($a->{$key}) eq 'HASH' && ref($val) eq 'HASH') {
- merge($a->{$key}, $val);
- } elsif ($a->{$key} =~ $is_num && $val =~ $is_num) {
- $a->{$key} += $val;
- } else {
- die "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
- $key, ref($key), $a->{$key}, ref($a->{$key}), $val, ref($val);
- }
+ # $dest_path already exists, not replicating it
+ return if !$force && -f $dest_path;
+
+ say "Replicating $remote_url to $dest_path (force:$force)... ";
+ my $response = LWP::UserAgent->new->get($remote_url);
+ unless ( $response->is_success ) {
+ say "\nFailed to fetch the file: " . $response->status_line;
+ return;
+ }
+
+ FileHelper::write $dest_path, $response->decoded_content;
+ say 'done';
}
- }
+}
- sub feed_ips (@stats) {
- my (%gemini, %web);
+package Foostats::Merger {
+ use Data::Dumper; # TODO: UNDO
- for my $stats (@stats) {
- my $merge = $stats->{proto} eq 'web' ? \%web : \%gemini;
- printf "Merging proto %s feed IPs\n", $stats->{proto};
- merge_ips($merge, $stats->{feed_ips});
+ sub merge ($stats_dir) {
+ my %merge;
+ $merge{$_} = merge_for_date( $stats_dir, $_ )
+ for DateHelper::last_month_dates;
+ return %merge;
}
- my %total;
- merge_ips(\%total, $web{$_}) for keys %web;
- merge_ips(\%total, $gemini{$_}) for keys %gemini;
+ sub merge_for_date ( $stats_dir, $date ) {
+ printf "Merging for date %s\n", $date;
- my %merge = (
- 'Total' => scalar keys %total,
- 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*,
- 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*,
- 'Web Gemfeed' => scalar keys $web{gemfeed}->%*,
- 'Web Atom' => scalar keys $web{atom_feed}->%*,
- );
+ my @stats = stats_for_date( $stats_dir, $date );
+ return {
+ feed_ips => feed_ips(@stats),
+ count => count(@stats),
+ page_ips => page_ips(@stats),
+ };
+ }
- return \%merge;
- }
+ sub merge_ips ( $a, $b, $key_transform = undef ) {
+ my sub merge ( $a, $b ) {
+ while ( my ( $key, $val ) = each %$b ) {
+ $a->{$key} //= 0;
+ $a->{$key} += $val;
+ }
+ }
+
+ my $is_num = qr/^\d+(\.\d+)?$/;
+
+ while ( my ( $key, $val ) = each %$b ) {
+ $key = $key_transform->($key) if defined $key_transform;
+
+ if ( not exists $a->{$key} ) {
+ $a->{$key} = $val;
+ }
+ elsif ( ref( $a->{$key} ) eq 'HASH' && ref($val) eq 'HASH' ) {
+ merge( $a->{$key}, $val );
+ }
+ elsif ( $a->{$key} =~ $is_num && $val =~ $is_num ) {
+ $a->{$key} += $val;
+ }
+ else {
+ die sprintf
+"Not merging key '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
+ $key, ref($key), $a->{$key}, ref( $a->{$key} ), $val,
+ ref($val);
+ }
+ }
+ }
- sub count (@stats) {
- my %merge;
+ sub feed_ips (@stats) {
+ my ( %gemini, %web );
+
+ for my $stats (@stats) {
+ my $merge = $stats->{proto} eq 'web' ? \%web : \%gemini;
+ printf "Merging proto %s feed IPs\n", $stats->{proto};
+ merge_ips( $merge, $stats->{feed_ips} );
+ }
+
+ my %total;
+ merge_ips( \%total, $web{$_} ) for keys %web;
+ merge_ips( \%total, $gemini{$_} ) for keys %gemini;
+
+ my %merge = (
+ 'Total' => scalar keys %total,
+ 'Gemini Gemfeed' => scalar keys $gemini{gemfeed}->%*,
+ 'Gemini Atom' => scalar keys $gemini{atom_feed}->%*,
+ 'Web Gemfeed' => scalar keys $web{gemfeed}->%*,
+ 'Web Atom' => scalar keys $web{atom_feed}->%*,
+ );
- for my $stats (@stats) {
- while (my ($key, $val) = each $stats->{count}->%*) {
- $merge{$key} //= 0;
- $merge{$key} += $val;
- }
+ return \%merge;
}
- return \%merge;
- }
+ sub count (@stats) {
+ my %merge;
- sub page_ips (@stats) {
- my %merge = (urls => {}, hosts => {});
+ for my $stats (@stats) {
+ while ( my ( $key, $val ) = each $stats->{count}->%* ) {
+ $merge{$key} //= 0;
+ $merge{$key} += $val;
+ }
+ }
- for my $key (keys %merge) {
- merge_ips($merge{$key}, $_->{page_ips}->{$key}, sub ($key) {
- $key =~ s/\.html$/.../;
- $key =~ s/\.gmi$/.../;
- $key;
- }) for @stats;
+ return \%merge;
+ }
- # Keep only uniq IP count
- $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*;
- }
+ sub page_ips (@stats) {
+ my %merge = ( urls => {}, hosts => {} );
+
+ for my $key ( keys %merge ) {
+ merge_ips(
+ $merge{$key},
+ $_->{page_ips}->{$key},
+ sub ($key) {
+ $key =~ s/\.html$/.../;
+ $key =~ s/\.gmi$/.../;
+ $key;
+ }
+ ) for @stats;
+
+ # Keep only uniq IP count
+ $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%*
+ for keys $merge{$key}->%*;
+ }
+
+ return \%merge;
+ }
- return \%merge;
- }
+ sub stats_for_date ( $stats_dir, $date ) {
+ my @stats;
- sub stats_for_date ($stats_dir, $date) {
- my @stats;
+ for my $proto (qw(gemini web)) {
+ for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) {
+ printf "Reading %s\n", $path;
+ push @stats, FileHelper::read_json_gz($path);
+ @{ $stats[-1] }{qw(proto path)} = ( $proto, $path );
+ }
+ }
- for my $proto (qw(gemini web)) {
- for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) {
- printf "Reading %s\n", $path;
- push @stats, FileHelper::read_json_gz($path);
- @{$stats[-1]}{qw(proto path)} = ($proto, $path);
- }
+ return @stats;
}
-
- return @stats;
- }
}
package Foostats::Reporter {
- use Data::Dumper;
+ use Data::Dumper;
- sub report (%merged) {
- print Dumper %merged;
- }
+ sub report (%merged) {
+ print Dumper %merged;
+ }
}
package main {
- use Getopt::Long;
- use Sys::Hostname;
-
- sub parse_logs ($stats_dir) {
- my $out = Foostats::FileOutputter->new(stats_dir => $stats_dir);
+ use Getopt::Long;
+ use Sys::Hostname;
- $out->{stats} = Foostats::Logreader::parse_logs(
- $out->last_processed_date('web'),
- $out->last_processed_date('gemini'),
- );
+ sub parse_logs ($stats_dir) {
+ my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir );
- $out->write;
- }
-
- my ($parse_logs, $replicate, $report, $all);
-
- # With default values
- my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
- my $partner_node = hostname eq 'fishfinger.buetow.org' ? 'blowfish.buetow.org' : 'fishfinger.buetow.org';
+ $out->{stats} = Foostats::Logreader::parse_logs(
+ $out->last_processed_date('web'),
+ $out->last_processed_date('gemini'),
+ );
- # TODO: Add help output
- GetOptions 'parse-logs!' => \$parse_logs,
- 'replicate!' => \$replicate,
- 'report!' => \$report,
- 'all!' => \$all,
- 'stats-dir=s' => \$stats_dir,
- 'partner-node=s' => \$partner_node;
+ $out->write;
+ }
- parse_logs $stats_dir if $parse_logs or $all;
- Foostats::Replicator::replicate($stats_dir, $partner_node) if $replicate or $all;
- Foostats::Reporter::report(Foostats::Merger::merge($stats_dir)) if $report or $all;
+ my ( $parse_logs, $replicate, $report, $all );
+
+ # With default values
+ my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
+ my $partner_node =
+ hostname eq 'fishfinger.buetow.org'
+ ? 'blowfish.buetow.org'
+ : 'fishfinger.buetow.org';
+
+ # TODO: Add help output
+ GetOptions
+ 'parse-logs!' => \$parse_logs,
+ 'replicate!' => \$replicate,
+ 'report!' => \$report,
+ 'all!' => \$all,
+ 'stats-dir=s' => \$stats_dir,
+ 'partner-node=s' => \$partner_node;
+
+ parse_logs $stats_dir if $parse_logs or $all;
+ Foostats::Replicator::replicate( $stats_dir, $partner_node )
+ if $replicate or $all;
+ Foostats::Reporter::report( Foostats::Merger::merge($stats_dir) )
+ if $report or $all;
}
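
One feature worth noting when reading the diff: the script leans on Perl's experimental refaliasing feature, e.g. \my $s = \$self->{stats}{$date_key}, to create short aliases into nested structures rather than copies. A minimal, self-contained sketch of the behaviour; the %stats layout here is illustrative only, not taken from the script:

    use v5.38;
    use feature qw(refaliasing);
    no warnings qw(experimental::refaliasing);

    my %stats = ( count => { filtered => 0 } );

    # $c becomes an alias for $stats{count}, not a copy of it.
    \my $c = \$stats{count};
    $c->{filtered}++;

    say $stats{count}{filtered};    # prints 1: the update went through the alias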