 foostats.pl      | 914
 t/tmp_filter_log |   3
 2 files changed, 489 insertions(+), 428 deletions(-)
diff --git a/foostats.pl b/foostats.pl
index 4e32a9f..2b2b2fc 100644
--- a/foostats.pl
+++ b/foostats.pl
@@ -27,34 +27,34 @@ use constant VERSION => 'v0.1.0';
 package FileHelper {
     use JSON;

-    sub write ( $path, $content ) {
+    sub write ($path, $content) {
         open my $fh, '>', "$path.tmp"
-          or die "\nCannot open file: $!";
+            or die "\nCannot open file: $!";
         print $fh $content;
         close $fh;
         rename
-          "$path.tmp",
-          $path;
+            "$path.tmp",
+            $path;
     }

-    sub write_json_gz ( $path, $data ) {
+    sub write_json_gz ($path, $data) {
         my $json = encode_json $data;
         say "Writing $path";
         open my $fd, '>:gzip', "$path.tmp"
-          or die "$path.tmp: $!";
+            or die "$path.tmp: $!";
         print $fd $json;
         close $fd;
         rename "$path.tmp", $path
-          or die "$path.tmp: $!";
+            or die "$path.tmp: $!";
     }

     sub read_json_gz ($path) {
         say "Reading $path";
         open my $fd, '<:gzip', $path
-          or die "$path: $!";
+            or die "$path: $!";
         my $json = decode_json <$fd>;
         close $fd;
         return $json;
@@ -62,9 +62,9 @@ package FileHelper {
     sub read_lines ($path) {
         my @lines;
-        open( my $fh, '<', $path )
-          or die "$path: $!";
-        chomp( @lines = <$fh> );
+        open(my $fh, '<', $path)
+            or die "$path: $!";
+        chomp(@lines = <$fh>);
         close($fh);
         return @lines;
     }
@@ -77,18 +77,18 @@ package DateHelper {
         my $today = localtime;
         my @dates;

-        for my $days_ago ( 0 .. 30 ) {
-            my $date = $today - ( $days_ago * 24 * 60 * 60 );
+        for my $days_ago (0 .. 30) {
+            my $date = $today - ($days_ago * 24 * 60 * 60);
             push
-              @dates,
-              $date->strftime('%Y%m%d');
+                @dates,
+                $date->strftime('%Y%m%d');
         }

         return @dates;
     }

     sub last_n_months_day_dates ($months) {
-        my $today = localtime;
+        my $today = localtime;
         my $start_year = $today->year;
         my $start_month = $today->mon - $months;
         while ($start_month <= 0) { $start_month += 12; $start_year--; }
@@ -98,7 +98,7 @@ package DateHelper {
         my $t = $start;
         while ($t <= $today) {
             push @dates, $t->strftime('%Y%m%d');
-            $t += 24 * 60 * 60;    # one day
+            $t += 24 * 60 * 60;    # one day
         }
         return @dates;
     }
@@ -117,34 +117,34 @@ package Foostats::Logreader {
     sub anonymize_ip ($ip) {
         my $ip_proto =
-          contains( $ip, ':' )
-          ? 'IPv6'
-          : 'IPv4';
+            contains($ip, ':')
+            ? 'IPv6'
+            : 'IPv4';
         my $ip_hash = sha3_512_base64 $ip;

-        return ( $ip_hash, $ip_proto );
+        return ($ip_hash, $ip_proto);
     }

-    sub read_lines ( $glob, $cb ) {
+    sub read_lines ($glob, $cb) {
         my sub year ($path) {
-            localtime( ( stat $path )->mtime )->strftime('%Y');
+            localtime((stat $path)->mtime)->strftime('%Y');
         }

         my sub open_file ($path) {
             my $flag =
-              $path =~ /\.gz$/
-              ? '<:gzip'
-              : '<';
+                $path =~ /\.gz$/
+                ? '<:gzip'
+                : '<';
             open my $fd, $flag, $path
-              or die "$path: $!";
+                or die "$path: $!";
             return $fd;
         }

         my $last = false;
-        say 'File path glob matches: ' . join( ' ', glob $glob );
+        say 'File path glob matches: ' . join(' ', glob $glob);

-        LAST:
-        for my $path ( sort { -M $a <=> -M $b } glob $glob ) {
+        LAST:
+        for my $path (sort { -M $a <=> -M $b } glob $glob) {
             say "Processing $path";
             my $file = open_file $path;
@@ -152,37 +152,37 @@ package Foostats::Logreader {
             while (<$file>) {
                 next
-                  if contains( $_, 'logfile turned over' );
+                    if contains($_, 'logfile turned over');

                 # last == true means: After this file, don't process more
                 $last = true
-                  unless defined $cb->( $year, split / +/ );
+                    unless defined $cb->($year, split / +/);
             }

             say "Closing $path (last:$last)";
             close $file;

             last LAST
-              if $last;
+                if $last;
         }
     }

-    sub parse_web_logs ( $last_processed_date, $cb ) {
+    sub parse_web_logs ($last_processed_date, $cb) {
         my sub parse_date ($date) {
-            my $t = Time::Piece->strptime( $date, '[%d/%b/%Y:%H:%M:%S' );
-            return ( $t->strftime('%Y%m%d'), $t->strftime('%H%M%S') );
+            my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S');
+            return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S'));
         }

         my sub parse_web_line (@line) {
-            my ( $date, $time ) = parse_date $line [4];
+            my ($date, $time) = parse_date $line [4];

             return undef
-              if $date < $last_processed_date;
+                if $date < $last_processed_date;

             # X-Forwarded-For?
             my $ip =
-              $line[-2] eq '-'
-              ? $line[1]
-              : $line[-2];
-            my ( $ip_hash, $ip_proto ) = anonymize_ip $ip;
+                $line[-2] eq '-'
+                ? $line[1]
+                : $line[-2];
+            my ($ip_hash, $ip_proto) = anonymize_ip $ip;

             return {
                 proto => 'web',
@@ -196,42 +196,41 @@ package Foostats::Logreader {
             };
         }

-        read_lines web_logs_glob(), sub ( $year, @line ) {
-            $cb->( parse_web_line @line );
+        read_lines web_logs_glob(), sub ($year, @line) {
+            $cb->(parse_web_line @line);
         };
     }

-    sub parse_gemini_logs ( $last_processed_date, $cb ) {
-        my sub parse_date ( $year, @line ) {
+    sub parse_gemini_logs ($last_processed_date, $cb) {
+        my sub parse_date ($year, @line) {
             my $timestr = "$line[0] $line[1]";
-            return Time::Piece->strptime( $timestr, '%b %d' )
-              ->strftime("$year%m%d");
+            return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d");
         }

-        my sub parse_vger_line ( $year, @line ) {
+        my sub parse_vger_line ($year, @line) {
             my $full_path = $line[5];
             $full_path =~ s/"//g;

-            my ( $proto, undef, $host, $uri_path ) =
-              split '/',
-              $full_path,
-              4;
+            my ($proto, undef, $host, $uri_path) =
+                split '/',
+                $full_path,
+                4;

             $uri_path = ''
-              unless defined $uri_path;
+                unless defined $uri_path;

             return {
                 proto => 'gemini',
                 host => $host,
                 uri_path => "/$uri_path",
                 status => $line[6],
-                date => int( parse_date( $year, @line ) ),
+                date => int(parse_date($year, @line)),
                 time => $line[2],
             };
         }

-        my sub parse_relayd_line ( $year, @line ) {
-            my $date = int( parse_date( $year, @line ) );
+        my sub parse_relayd_line ($year, @line) {
+            my $date = int(parse_date($year, @line));

-            my ( $ip_hash, $ip_proto ) = anonymize_ip $line [12];
+            my ($ip_hash, $ip_proto) = anonymize_ip $line [12];
             return {
                 ip_hash => $ip_hash,
                 ip_proto => $ip_proto,
@@ -240,26 +239,26 @@ package Foostats::Logreader {
             };
         }

-        # Expect one vger and one relayd log line per event! So collect
-        # both events (one from one log line each) and then merge the result hash!
+        # Expect one vger and one relayd log line per event! So collect
+        # both events (one from one log line each) and then merge the result hash!
-        my ( $vger, $relayd );
-        read_lines gemini_logs_glob(), sub ( $year, @line ) {
-            if ( $line[4] eq 'vger:' ) {
+        my ($vger, $relayd);
+        read_lines gemini_logs_glob(), sub ($year, @line) {
+            if ($line[4] eq 'vger:') {
                 $vger = parse_vger_line $year, @line;
             }
-            elsif ( $line[5] eq 'relay'
-                and startswith( $line[6], 'gemini' ) )
+            elsif ($line[5] eq 'relay'
+                and startswith($line[6], 'gemini'))
             {
                 $relayd = parse_relayd_line $year, @line;
                 return undef
-                  if $relayd->{date} < $last_processed_date;
+                    if $relayd->{date} < $last_processed_date;
             }

             if (    defined $vger
                 and defined $relayd
-                and $vger->{time} eq $relayd->{time} )
+                and $vger->{time} eq $relayd->{time})
             {
-                $cb->( { %$vger, %$relayd } );
+                $cb->({ %$vger, %$relayd });
                 $vger = $relayd = undef;
             }
@@ -267,9 +266,8 @@ package Foostats::Logreader {
         };
     }

-    sub parse_logs ( $last_web_date, $last_gemini_date, $odds_file, $odds_log )
-    {
-        my $agg = Foostats::Aggregator->new( $odds_file, $odds_log );
+    sub parse_logs ($last_web_date, $last_gemini_date, $odds_file, $odds_log) {
+        my $agg = Foostats::Aggregator->new($odds_file, $odds_log);

         say "Last web date: $last_web_date";
         say "Last gemini date: $last_gemini_date";
@@ -289,26 +287,26 @@ package Foostats::Filter {
     use String::Util qw(contains startswith endswith);

-    sub new ( $class, $odds_file, $log_path ) {
+    sub new ($class, $odds_file, $log_path) {
         say "Logging filter to $log_path";
         my @odds = FileHelper::read_lines($odds_file);
         bless {
             odds => \@odds,
             log_path => $log_path
-          },
-          $class;
+        },
+        $class;
     }

-    sub ok ( $self, $event ) {
+    sub ok ($self, $event) {
         state %blocked = ();

         return false
-          if exists $blocked{ $event->{ip_hash} };
+            if exists $blocked{ $event->{ip_hash} };

         if (   $self->odd($event)
-            or $self->excessive($event) )
+            or $self->excessive($event))
         {
-            ( $blocked{ $event->{ip_hash} } //= 0 )++;
+            ($blocked{ $event->{ip_hash} } //= 0)++;
             return false;
         }
         else {
@@ -316,54 +314,52 @@ package Foostats::Filter {
         }
     }

-    sub odd ( $self, $event ) {
+    sub odd ($self, $event) {
         \my $uri_path = \$event->{uri_path};

-        for ( $self->{odds}->@* ) {
+        for ($self->{odds}->@*) {
             next if !defined $_ || $_ eq '' || /^\s*#/;
             next
-              unless contains( $uri_path, $_ );
+                unless contains($uri_path, $_);

-            $self->log( 'WARN', $uri_path,
-                "contains $_ and is odd and will therefore be blocked!" );
+            $self->log('WARN', $uri_path, "contains $_ and is odd and will therefore be blocked!");
             return true;
         }

-        $self->log( 'OK', $uri_path, "appears fine..." );
+        $self->log('OK', $uri_path, "appears fine...");
         return false;
     }

-    sub log ( $self, $severity, $subject, $message ) {
+    sub log ($self, $severity, $subject, $message) {
        state %dedup;

        # Don't log if path was already logged
        return
-          if exists $dedup{$subject};
+            if exists $dedup{$subject};
        $dedup{$subject} = 1;

-        open( my $fh, '>>', $self->{log_path} )
-          or die $self->{log_path} . ": $!";
+        open(my $fh, '>>', $self->{log_path})
+            or die $self->{log_path} . ": $!";
        print $fh "$severity: $subject $message\n";
        close($fh);
     }

-    sub excessive ( $self, $event ) {
+    sub excessive ($self, $event) {
         \my $time = \$event->{time};
         \my $ip_hash = \$event->{ip_hash};

         state $last_time = $time;    # Time with second: 'HH:MM:SS'
         state %count = ();           # IPs accessing within the same second!

-        if ( $last_time ne $time ) {
+        if ($last_time ne $time) {
             $last_time = $time;
             %count = ();
             return false;
         }

         # IP requested site more than once within the same second!?
-        if ( 1 < ++( $count{$ip_hash} //= 0 ) ) {
-            $self->log( 'WARN', $ip_hash,
-                "blocked due to excessive requesting..." );
+        if (1 < ++($count{$ip_hash} //= 0)) {
+            $self->log('WARN', $ip_hash, "blocked due to excessive requesting...");
             return true;
         }
@@ -380,17 +376,17 @@ package Foostats::Aggregator {
         GEMFEED_URI_2 => '/gemfeed/',
     };

-    sub new ( $class, $odds_file, $odds_log ) {
+    sub new ($class, $odds_file, $odds_log) {
         bless {
-            filter => Foostats::Filter->new( $odds_file, $odds_log ),
+            filter => Foostats::Filter->new($odds_file, $odds_log),
             stats => {}
-          },
-          $class;
+        },
+        $class;
     }

-    sub add ( $self, $event ) {
+    sub add ($self, $event) {
         return undef
-          unless defined $event;
+            unless defined $event;

         my $date = $event->{date};
         my $date_key = $event->{proto} . "_$date";
@@ -400,9 +396,7 @@ package Foostats::Aggregator {
         # - feed_ips: unique IPs per feed type (atom_feed, gemfeed)
         # - page_ips: unique IPs per host and per URL
         $self->{stats}{$date_key} //= {
-            count => {
-                filtered => 0,
-            },
+            count => { filtered => 0, },
             feed_ips => {
                 atom_feed => {},
                 gemfeed => {},
@@ -414,56 +408,56 @@ package Foostats::Aggregator {
         };
         \my $s = \$self->{stats}{$date_key};

-        unless ( $self->{filter}->ok($event) ) {
+        unless ($self->{filter}->ok($event)) {
             $s->{count}{filtered}++;
             return $event;
         }

-        $self->add_count( $s, $event );
-        $self->add_page_ips( $s, $event )
-          unless $self->add_feed_ips( $s, $event );
+        $self->add_count($s, $event);
+        $self->add_page_ips($s, $event)
+            unless $self->add_feed_ips($s, $event);

         return $event;
     }

-    sub add_count ( $self, $stats, $event ) {
+    sub add_count ($self, $stats, $event) {
         \my $c = \$stats->{count};
         \my $e = \$event;

-        ( $c->{ $e->{proto} } //= 0 )++;
-        ( $c->{ $e->{ip_proto} } //= 0 )++;
+        ($c->{ $e->{proto} } //= 0)++;
+        ($c->{ $e->{ip_proto} } //= 0)++;
     }

-    sub add_feed_ips ( $self, $stats, $event ) {
+    sub add_feed_ips ($self, $stats, $event) {
         \my $f = \$stats->{feed_ips};
         \my $e = \$event;

         # Atom feed (exact path match, allow optional query string)
-        if ( $e->{uri_path} =~ m{^/gemfeed/atom\.xml(?:[?#].*)?$} ) {
-            ( $f->{atom_feed}->{ $e->{ip_hash} } //= 0 )++;
+        if ($e->{uri_path} =~ m{^/gemfeed/atom\.xml(?:[?#].*)?$}) {
+            ($f->{atom_feed}->{ $e->{ip_hash} } //= 0)++;
             return 1;
         }

         # Gemfeed index: '/gemfeed/' or '/gemfeed/index.gmi' (optionally with query)
-        if ( $e->{uri_path} =~ m{^/gemfeed/(?:index\.gmi)?(?:[?#].*)?$} ) {
-            ( $f->{gemfeed}->{ $e->{ip_hash} } //= 0 )++;
+        if ($e->{uri_path} =~ m{^/gemfeed/(?:index\.gmi)?(?:[?#].*)?$}) {
+            ($f->{gemfeed}->{ $e->{ip_hash} } //= 0)++;
             return 1;
         }

         return 0;
     }

-    sub add_page_ips ( $self, $stats, $event ) {
+    sub add_page_ips ($self, $stats, $event) {
         \my $e = \$event;
         \my $p = \$stats->{page_ips};

         return
-          if !endswith( $e->{uri_path}, '.html' )
-          && !endswith( $e->{uri_path}, '.gmi' );
+            if !endswith($e->{uri_path}, '.html')
+            && !endswith($e->{uri_path}, '.gmi');

-        ( $p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0 )++;
-        ( $p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //=
-              0 )++;
+        ($p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0)++;
+        ($p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //=
+            0)++;
     }
 }
@@ -472,43 +466,41 @@ package Foostats::FileOutputter {
     use Sys::Hostname;
     use PerlIO::gzip;

-    sub new ( $class, %args ) {
+    sub new ($class, %args) {
         my $self = bless \%args, $class;

         mkdir $self->{stats_dir}
-          or die $self->{stats_dir} . ": $!"
-          unless -d $self->{stats_dir};
+            or die $self->{stats_dir} . ": $!"
+            unless -d $self->{stats_dir};

         return $self;
     }

-    sub last_processed_date ( $self, $proto ) {
-        my $hostname = hostname();
-        my @processed =
-          glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
+    sub last_processed_date ($self, $proto) {
+        my $hostname = hostname();
+        my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";

         my ($date) =
-          @processed
-          ? ( $processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/ )
-          : 0;
+            @processed
+            ? ($processed[-1] =~ /_(\d{8})\.$hostname\.json.gz/)
+            : 0;

         return int($date);
     }

     sub write ($self) {
         $self->for_dates(
-            sub ( $self, $date_key, $stats ) {
+            sub ($self, $date_key, $stats) {
                 my $hostname = hostname();
-                my $path =
-                  $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
+                my $path = $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
                 FileHelper::write_json_gz
-                  $path,
-                  $stats;
+                    $path,
+                    $stats;
             }
         );
     }

-    sub for_dates ( $self, $cb ) {
-        $cb->( $self, $_, $self->{stats}{$_} ) for sort
-          keys $self->{stats}->%*;
+    sub for_dates ($self, $cb) {
+        $cb->($self, $_, $self->{stats}{$_}) for sort
+            keys $self->{stats}->%*;
     }
 }
@@ -518,7 +510,7 @@ package Foostats::Replicator {
     use LWP::UserAgent;
     use String::Util qw(endswith);

-    sub replicate ( $stats_dir, $partner_node ) {
+    sub replicate ($stats_dir, $partner_node) {
         say "Replicating from $partner_node";

         for my $proto (qw(gemini web)) {
@@ -532,51 +524,50 @@ package Foostats::Replicator {
                     "https://$partner_node/foostats/$dest_path",
                     "$stats_dir/$dest_path",
                     $count++
-                      <
-                      3
+                        <
+                        3
                     ,    # Always replicate the newest 3 files.
                 );
             }
         }
     }

-    sub replicate_file ( $remote_url, $dest_path, $force ) {
+    sub replicate_file ($remote_url, $dest_path, $force) {

         # $dest_path already exists, not replicating it
         return
-          if !$force
-          && -f $dest_path;
+            if !$force
+            && -f $dest_path;

         say "Replicating $remote_url to $dest_path (force:$force)... ";
         my $response = LWP::UserAgent->new->get($remote_url);

-        unless ( $response->is_success ) {
+        unless ($response->is_success) {
             say "\nFailed to fetch the file: " . $response->status_line;
             return;
         }

         FileHelper::write
-          $dest_path,
-          $response->decoded_content;
+            $dest_path,
+            $response->decoded_content;
         say 'done';
     }
 }

 package Foostats::Merger {
-    # Removed Data::Dumper (debug-only) per review.
     sub merge ($stats_dir) {
         my %merge;
-        $merge{$_} = merge_for_date( $stats_dir, $_ )
-          for DateHelper::last_month_dates;
+        $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates;
         return %merge;
     }

-    sub merge_for_date ( $stats_dir, $date ) {
+    sub merge_for_date ($stats_dir, $date) {
         printf
-          "Merging for date %s\n",
-          $date;
+            "Merging for date %s\n",
+            $date;

-        my @stats = stats_for_date( $stats_dir, $date );
+        my @stats = stats_for_date($stats_dir, $date);
         return {
             feed_ips => feed_ips(@stats),
             count => count(@stats),
@@ -584,9 +575,9 @@ package Foostats::Merger {
         };
     }

-    sub merge_ips ( $a, $b, $key_transform = undef ) {
-        my sub merge ( $a, $b ) {
-            while ( my ( $key, $val ) = each %$b ) {
+    sub merge_ips ($a, $b, $key_transform = undef) {
+        my sub merge ($a, $b) {
+            while (my ($key, $val) = each %$b) {
                 $a->{$key} //= 0;
                 $a->{$key} += $val;
             }
@@ -594,52 +585,52 @@ package Foostats::Merger {
         my $is_num = qr/^\d+(\.\d+)?$/;

-        while ( my ( $key, $val ) = each %$b ) {
+        while (my ($key, $val) = each %$b) {
             $key = $key_transform->($key)
-              if defined $key_transform;
+                if defined $key_transform;

-            if ( not exists $a->{$key} ) {
+            if (not exists $a->{$key}) {
                 $a->{$key} = $val;
             }
-            elsif (ref( $a->{$key} ) eq 'HASH'
-                && ref($val) eq 'HASH' )
+            elsif (ref($a->{$key}) eq 'HASH'
+                && ref($val) eq 'HASH')
             {
-                merge( $a->{$key}, $val );
+                merge($a->{$key}, $val);
             }
             elsif ($a->{$key} =~ $is_num
-                && $val =~ $is_num )
+                && $val =~ $is_num)
             {
                 $a->{$key} += $val;
             }
             else {
                 die
-"Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
-                  $key,
-                  ref($key), $a->{$key},
-                  ref( $a->{$key} ),
-                  $val,
-                  ref($val);
+                    "Not merging tkey '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
+                    $key,
+                    ref($key), $a->{$key},
+                    ref($a->{$key}),
+                    $val,
+                    ref($val);
             }
         }
     }

     sub feed_ips (@stats) {
-        my ( %gemini, %web );
+        my (%gemini, %web);
         for my $stats (@stats) {
             my $merge =
-              $stats->{proto} eq 'web'
-              ? \%web
-              : \%gemini;
+                $stats->{proto} eq 'web'
+                ? \%web
+                : \%gemini;
             printf
-              "Merging proto %s feed IPs\n",
-              $stats->{proto};
-            merge_ips( $merge, $stats->{feed_ips} );
+                "Merging proto %s feed IPs\n",
+                $stats->{proto};
+            merge_ips($merge, $stats->{feed_ips});
         }

         my %total;
-        merge_ips( \%total, $web{$_} ) for keys %web;
-        merge_ips( \%total, $gemini{$_} ) for keys %gemini;
+        merge_ips(\%total, $web{$_}) for keys %web;
+        merge_ips(\%total, $gemini{$_}) for keys %gemini;

         my %merge = (
             'Total' => scalar keys %total,
@@ -656,7 +647,7 @@ package Foostats::Merger {
         my %merge;

         for my $stats (@stats) {
-            while ( my ( $key, $val ) = each $stats->{count}->%* ) {
+            while (my ($key, $val) = each $stats->{count}->%*) {
                 $merge{$key} //= 0;
                 $merge{$key} += $val;
             }
@@ -671,7 +662,7 @@ package Foostats::Merger {
             hosts => {}
         );

-        for my $key ( keys %merge ) {
+        for my $key (keys %merge) {
             merge_ips(
                 $merge{$key},
                 $_->{page_ips}->{$key},
@@ -683,25 +674,24 @@ package Foostats::Merger {
             ) for @stats;

             # Keep only uniq IP count
-            $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%*
-              for keys $merge{$key}->%*;
+            $merge{$key}->{$_} = scalar keys $merge{$key}->{$_}->%* for keys $merge{$key}->%*;
         }

         return \%merge;
     }

-    sub stats_for_date ( $stats_dir, $date ) {
+    sub stats_for_date ($stats_dir, $date) {
         my @stats;

         for my $proto (qw(gemini web)) {
             for my $path (<$stats_dir/${proto}_${date}.*.json.gz>) {
                 printf
-                  "Reading %s\n",
-                  $path;
+                    "Reading %s\n",
+                    $path;
                 push
-                  @stats,
-                  FileHelper::read_json_gz($path);
-                @{ $stats[-1] }{qw(proto path)} = ( $proto, $path );
+                    @stats,
+                    FileHelper::read_json_gz($path);
+                @{ $stats[-1] }{qw(proto path)} = ($proto, $path);
             }
         }
@@ -714,7 +704,7 @@ package Foostats::Reporter {
     use HTML::Entities qw(encode_entities);

     sub truncate_url {
-        my ( $url, $max_length ) = @_;
+        my ($url, $max_length) = @_;
         $max_length //= 100;    # Default to 100 characters

         return $url if length($url) <= $max_length;
@@ -725,44 +715,44 @@ package Foostats::Reporter {
         my $available_length = $max_length - $ellipsis_length;

         # Split available length between start and end, favoring the end
-        my $keep_start = int( $available_length * 0.4 );    # 40% for start
+        my $keep_start = int($available_length * 0.4);    # 40% for start
         my $keep_end = $available_length - $keep_start;    # 60% for end

-        my $start = substr( $url, 0, $keep_start );
-        my $end = substr( $url, -$keep_end );
+        my $start = substr($url, 0, $keep_start);
+        my $end = substr($url, -$keep_end);

         return $start . $ellipsis . $end;
     }

     sub truncate_urls_for_table {
-        my ( $url_rows, $count_column_header ) = @_;
+        my ($url_rows, $count_column_header) = @_;

         # Calculate the maximum width needed for the count column
         my $max_count_width = length($count_column_header);
         for my $row (@$url_rows) {
-            my $count_width = length( $row->[1] );
+            my $count_width = length($row->[1]);
             $max_count_width = $count_width if $count_width > $max_count_width;
         }

         # Row format: "| URL... | count |" with padding
         # Calculate: "| " (2) + URL + " | " (3) + count_with_padding + " |" (2)
         my $max_url_length = 100 - 7 - $max_count_width;
-        $max_url_length = 70 if $max_url_length > 70;    # Cap at reasonable length
+        $max_url_length = 70 if $max_url_length > 70;    # Cap at reasonable length

         # Truncate URLs in place
         for my $row (@$url_rows) {
-            $row->[0] = truncate_url( $row->[0], $max_url_length );
+            $row->[0] = truncate_url($row->[0], $max_url_length);
         }
     }

     sub format_table {
-        my ( $headers, $rows ) = @_;
+        my ($headers, $rows) = @_;

         my @widths;
-        for my $col ( 0 .. $#{$headers} ) {
-            my $max_width = length( $headers->[$col] );
+        for my $col (0 .. $#{$headers}) {
+            my $max_width = length($headers->[$col]);
             for my $row (@$rows) {
-                my $len = length( $row->[$col] );
+                my $len = length($row->[$col]);
                 $max_width = $len if $len > $max_width;
             }
             push @widths, $max_width;
@@ -770,10 +760,10 @@ package Foostats::Reporter {
         my $header_line = '|';
         my $separator_line = '|';
-        for my $col ( 0 .. $#{$headers} ) {
+        for my $col (0 .. $#{$headers}) {
             $header_line .=
-              sprintf( " %-*s |", $widths[$col], $headers->[$col] );
-            $separator_line .= '-' x ( $widths[$col] + 2 ) . '|';
+                sprintf(" %-*s |", $widths[$col], $headers->[$col]);
+            $separator_line .= '-' x ($widths[$col] + 2) . '|';
         }

         my @table_lines;
@@ -783,33 +773,35 @@ package Foostats::Reporter {
         for my $row (@$rows) {
             my $row_line = '|';
-            for my $col ( 0 .. $#{$row} ) {
-                $row_line .= sprintf( " %-*s |", $widths[$col], $row->[$col] );
+            for my $col (0 .. $#{$row}) {
+                $row_line .= sprintf(" %-*s |", $widths[$col], $row->[$col]);
             }
             push @table_lines, $row_line;
         }

         push @table_lines, $separator_line;    # Add bottom terminator

-        return join( "\n", @table_lines );
+        return join("\n", @table_lines);
     }

     # Convert gemtext to HTML
     sub gemtext_to_html {
-        my ($content) = @_;
-        my $html = "";
-        my $in_code_block = 0;
-        my $in_list = 0;
-        my @lines = split /\n/, $content;
+        my ($content) = @_;
+        my $html = "";
+        my $in_code_block = 0;
+        my $in_list = 0;
+        my @lines = split /\n/, $content;
         my @code_block_lines = ();
-
+
         for my $line (@lines) {
             if ($line =~ /^```/) {
                 if ($in_code_block) {
+                    # End code block - check if it's a table
                     if (is_ascii_table(\@code_block_lines)) {
                         $html .= convert_ascii_table_to_html(\@code_block_lines);
-                    } else {
+                    }
+                    else {
                         $html .= "<pre>\n";
                         for my $code_line (@code_block_lines) {
                             $html .= encode_entities($code_line) . "\n";
@@ -817,18 +809,19 @@ package Foostats::Reporter {
                         $html .= "</pre>\n";
                     }
                     @code_block_lines = ();
-                    $in_code_block = 0;
-                } else {
+                    $in_code_block = 0;
+                }
+                else {
                     $in_code_block = 1;
                 }
                 next;
             }
-
+
             if ($in_code_block) {
                 push @code_block_lines, $line;
                 next;
             }
-
+
             # Skip 365-day summary section header in HTML output
             if ($line =~ /^## 365-Day Summary Reports\s*$/) {
                 next;
@@ -839,94 +832,106 @@ package Foostats::Reporter {
                 $html .= "</ul>\n";
                 $in_list = 0;
             }
-
+
             # Headers
             if ($line =~ /^### (.*)/) {
                 $html .= "<h3>" . encode_entities($1) . "</h3>\n";
-            } elsif ($line =~ /^## (.*)/) {
+            }
+            elsif ($line =~ /^## (.*)/) {
                 $html .= "<h2>" . encode_entities($1) . "</h2>\n";
-            } elsif ($line =~ /^# (.*)/) {
+            }
+            elsif ($line =~ /^# (.*)/) {
                 $html .= "<h1>" . encode_entities($1) . "</h1>\n";
             }
+            # Links
             elsif ($line =~ /^=> (\S+)\s+(.*)/) {
                 my ($url, $text) = ($1, $2);
+                # Drop 365-day summary links from HTML output
                 if ($url =~ /(?:^|[\/.])365day_summary_\d{8}\.gmi$/) {
                     next;
                 }
+                # Convert .gmi links to .html
                 $url =~ s/\.gmi$/\.html/;
                 $html .= "<p><a href=\"" . encode_entities($url) . "\">" . encode_entities($text) . "</a></p>\n";
             }
+            # Bullet points
             elsif ($line =~ /^\* (.*)/) {
                 if (!$in_list) {
                     $html .= "<ul>\n";
                     $in_list = 1;
                 }
-                $html .= "<li>" . encode_entities($1) . "</li>\n";
+                $html .= "<li>" . linkify_text($1) . "</li>\n";
             }
+            # Empty line - skip to avoid excessive spacing
             elsif ($line =~ /^\s*$/) {
+                # Skip empty lines for more compact output
            }
+            # Regular text
            else {
-                $html .= "<p>" . encode_entities($line) . "</p>\n";
+                $html .= "<p>" . linkify_text($line) . "</p>\n";
"</p>\n"; } } - + # Close list if still open if ($in_list) { $html .= "</ul>\n"; } - + return $html; } - + # Check if the lines form an ASCII table sub is_ascii_table { my ($lines) = @_; - return 0 if @$lines < 3; # Need at least header, separator, and one data row - + return 0 if @$lines < 3; # Need at least header, separator, and one data row + # Check for separator lines with dashes and pipes for my $line (@$lines) { return 1 if $line =~ /^\|?[\s\-]+\|/; } return 0; } - + # Convert ASCII table to HTML table sub convert_ascii_table_to_html { - my ($lines) = @_; - my $html = "<table>\n"; + my ($lines) = @_; + my $html = "<table>\n"; my $row_count = 0; - + for my $line (@$lines) { + # Skip separator lines next if $line =~ /^\|?[\s\-]+\|/ && $line =~ /\-/; - + # Parse table row my @cells = split /\s*\|\s*/, $line; - @cells = grep { length($_) > 0 } @cells; # Remove empty cells - + @cells = grep { length($_) > 0 } @cells; # Remove empty cells + if (@cells) { $html .= "<tr>\n"; + # First row is header my $tag = ($row_count == 0) ? "th" : "td"; for my $cell (@cells) { - $html .= " <$tag>" . encode_entities(trim($cell)) . "</$tag>\n"; + my $val = trim($cell); + $html .= " <$tag>" . linkify_text($val) . "</$tag>\n"; } $html .= "</tr>\n"; $row_count++; } } - + $html .= "</table>\n"; return $html; } - + # Trim whitespace from string sub trim { my ($str) = @_; @@ -934,9 +939,77 @@ package Foostats::Reporter { $str =~ s/\s+$//; return $str; } - + + # Build an href for a token that looks like a URL or FQDN + sub _guess_href { + my ($token) = @_; + my $t = $token; + $t =~ s/^\s+//; + $t =~ s/\s+$//; + + # Already absolute http(s) + return $t if $t =~ m{^https?://}i; + + # Extract trailing punctuation to avoid including it in href + my $trail = ''; + if ($t =~ s{([)\]\}.,;:!?]+)$}{}) { $trail = $1; } + + # host[/path] + if ($t =~ m{^([A-Za-z0-9.-]+\.[A-Za-z]{2,})(/[^\s<]*)?$}) { + my ($host, $path) = ($1, $2 // ''); + my $has_ellipsis = index($t, '...') != -1 || index(($path // ''), '...') != -1; + my $is_gemini = defined($path) && $path =~ /\.gmi(?:[?#].*)?$/i; + my $scheme = $is_gemini ? 'gemini' : 'https'; + + # If truncated, fall back to host root + my $href = + $has_ellipsis + ? sprintf('%s://%s/', $scheme, $host) + : sprintf('%s://%s%s', $scheme, $host, ($path eq '' ? '/' : $path)); + return ($href . 
$trail); + } + + return undef; + } + + # Turn any URLs/FQDNs in the provided text into anchors + sub linkify_text { + my ($text) = @_; + return '' unless defined $text; + + my $out = ''; + my $pos = 0; + while ($text =~ m{((?:https?://)?[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:/[^\s<]*)?)}g) { + my $match = $1; + my $start = $-[1]; + my $end = $+[1]; + + # Emit preceding text + $out .= encode_entities(substr($text, $pos, $start - $pos)); + + # Separate trailing punctuation from the match + my ($core, $trail) = ($match, ''); + if ($core =~ s{([)\]\}.,;:!?]+)$}{}) { $trail = $1; } + + my $href = _guess_href($core); + if ($href) { + $out .= sprintf('<a href="%s.html">%s</a>%s', + encode_entities($href), encode_entities($core), encode_entities($trail)); + } + else { + # Not a linkable token after all + $out .= encode_entities($match); + } + $pos = $end; + } + + # Remainder + $out .= encode_entities(substr($text, $pos)); + return $out; + } + # Use HTML::Entities::encode_entities imported above - + # Generate HTML wrapper sub generate_html_page { my ($title, $content) = @_; @@ -1006,48 +1079,45 @@ $content } sub report { - my ( $stats_dir, $output_dir, $html_output_dir, %merged ) = @_; - for my $date ( sort { $b cmp $a } keys %merged ) { + my ($stats_dir, $output_dir, $html_output_dir, %merged) = @_; + for my $date (sort { $b cmp $a } keys %merged) { my $stats = $merged{$date}; next unless $stats->{count}; - my ( $year, $month, $day ) = $date =~ /(\d{4})(\d{2})(\d{2})/; + my ($year, $month, $day) = $date =~ /(\d{4})(\d{2})(\d{2})/; # Check if .gmi file exists and its age based on date in filename - my $report_path = "$output_dir/$date.gmi"; + my $report_path = "$output_dir/$date.gmi"; my $html_report_path = "$output_dir/$date.html"; # Calculate age of the data based on date in filename my $today = Time::Piece->new(); - my $file_date = Time::Piece->strptime( $date, '%Y%m%d' ); - my $age_days = ( $today - $file_date ) / ( 24 * 60 * 60 ); + my $file_date = Time::Piece->strptime($date, '%Y%m%d'); + my $age_days = ($today - $file_date) / (24 * 60 * 60); - if ( -e $report_path && -e $html_report_path ) { + if (-e $report_path && -e $html_report_path) { # Files exist - if ( $age_days <= 3 ) { + if ($age_days <= 3) { # Data is recent (within 3 days), regenerate it - say -"Regenerating daily report for $year-$month-$day (data age: " - . sprintf( "%.1f", $age_days ) - . " days)"; + say "Regenerating daily report for $year-$month-$day (data age: " + . sprintf("%.1f", $age_days) + . " days)"; } else { # Data is old (older than 3 days), skip if files exist - say -"Skipping daily report for $year-$month-$day (files exist, data age: " - . sprintf( "%.1f", $age_days ) - . " days)"; + say "Skipping daily report for $year-$month-$day (files exist, data age: " + . sprintf("%.1f", $age_days) + . " days)"; next; } } else { # File doesn't exist, generate it - say -"Generating new daily report for $year-$month-$day (file doesn't exist, data age: " - . sprintf( "%.1f", $age_days ) - . " days)"; + say "Generating new daily report for $year-$month-$day (file doesn't exist, data age: " + . sprintf("%.1f", $age_days) + . 
" days)"; } my $report_content = ""; @@ -1057,27 +1127,23 @@ $content # Feed counts first $report_content .= "### Feed Statistics\n\n"; my @feed_rows; - push @feed_rows, [ 'Total', $stats->{feed_ips}{'Total'} // 0 ]; - push @feed_rows, - [ 'Gemini Gemfeed', $stats->{feed_ips}{'Gemini Gemfeed'} // 0 ]; - push @feed_rows, - [ 'Gemini Atom', $stats->{feed_ips}{'Gemini Atom'} // 0 ]; - push @feed_rows, - [ 'Web Gemfeed', $stats->{feed_ips}{'Web Gemfeed'} // 0 ]; - push @feed_rows, - [ 'Web Atom', $stats->{feed_ips}{'Web Atom'} // 0 ]; + push @feed_rows, [ 'Total', $stats->{feed_ips}{'Total'} // 0 ]; + push @feed_rows, [ 'Gemini Gemfeed', $stats->{feed_ips}{'Gemini Gemfeed'} // 0 ]; + push @feed_rows, [ 'Gemini Atom', $stats->{feed_ips}{'Gemini Atom'} // 0 ]; + push @feed_rows, [ 'Web Gemfeed', $stats->{feed_ips}{'Web Gemfeed'} // 0 ]; + push @feed_rows, [ 'Web Atom', $stats->{feed_ips}{'Web Atom'} // 0 ]; $report_content .= "```\n"; - $report_content .= - format_table( [ 'Feed Type', 'Count' ], \@feed_rows ); + $report_content .= format_table([ 'Feed Type', 'Count' ], \@feed_rows); $report_content .= "\n```\n\n"; + # Top 50 URLs next $report_content .= "### Top 50 URLs\n\n"; my @url_rows; my $urls = $stats->{page_ips}{urls}; my @sorted_urls = - sort { ( $urls->{$b} // 0 ) <=> ( $urls->{$a} // 0 ) } - keys %$urls; - my $truncated = @sorted_urls > 50; + sort { ($urls->{$b} // 0) <=> ($urls->{$a} // 0) } + keys %$urls; + my $truncated = @sorted_urls > 50; @sorted_urls = @sorted_urls[ 0 .. 49 ] if $truncated; for my $url (@sorted_urls) { @@ -1085,10 +1151,9 @@ $content } # Truncate URLs to fit within 100-character rows - truncate_urls_for_table( \@url_rows, 'Unique Visitors' ); + truncate_urls_for_table(\@url_rows, 'Unique Visitors'); $report_content .= "```\n"; - $report_content .= - format_table( [ 'URL', 'Unique Visitors' ], \@url_rows ); + $report_content .= format_table([ 'URL', 'Unique Visitors' ], \@url_rows); $report_content .= "\n```\n"; if ($truncated) { $report_content .= "\n... and more (truncated to 50 entries).\n"; @@ -1100,18 +1165,17 @@ $content my @host_rows; my $hosts = $stats->{page_ips}{hosts}; my @sorted_hosts = - sort { ( $hosts->{$b} // 0 ) <=> ( $hosts->{$a} // 0 ) } - keys %$hosts; + sort { ($hosts->{$b} // 0) <=> ($hosts->{$a} // 0) } + keys %$hosts; - $truncated = @sorted_hosts > 50; + $truncated = @sorted_hosts > 50; @sorted_hosts = @sorted_hosts[ 0 .. 49 ] if $truncated; for my $host (@sorted_hosts) { push @host_rows, [ $host, $hosts->{$host} // 0 ]; } $report_content .= "```\n"; - $report_content .= - format_table( [ 'Host', 'Unique Visitors' ], \@host_rows ); + $report_content .= format_table([ 'Host', 'Unique Visitors' ], \@host_rows); $report_content .= "\n```\n"; if ($truncated) { $report_content .= "\n... and more (truncated to 50 entries).\n"; @@ -1121,22 +1185,22 @@ $content # Summary last $report_content .= "### Summary\n\n"; my $total_requests = - ( $stats->{count}{gemini} // 0 ) + ( $stats->{count}{web} // 0 ); + ($stats->{count}{gemini} // 0) + ($stats->{count}{web} // 0); $report_content .= "* Total requests: $total_requests\n"; $report_content .= - "* Filtered requests: " . ( $stats->{count}{filtered} // 0 ) . "\n"; + "* Filtered requests: " . ($stats->{count}{filtered} // 0) . "\n"; $report_content .= - "* Gemini requests: " . ( $stats->{count}{gemini} // 0 ) . "\n"; + "* Gemini requests: " . ($stats->{count}{gemini} // 0) . "\n"; $report_content .= - "* Web requests: " . ( $stats->{count}{web} // 0 ) . "\n"; + "* Web requests: " . 
($stats->{count}{web} // 0) . "\n"; $report_content .= - "* IPv4 requests: " . ( $stats->{count}{IPv4} // 0 ) . "\n"; + "* IPv4 requests: " . ($stats->{count}{IPv4} // 0) . "\n"; $report_content .= - "* IPv6 requests: " . ( $stats->{count}{IPv6} // 0 ) . "\n\n"; + "* IPv6 requests: " . ($stats->{count}{IPv6} // 0) . "\n\n"; # Add links to summary reports (only monthly) $report_content .= "## Related Reports\n\n"; - my $now = localtime; + my $now = localtime; my $current_date = $now->strftime('%Y%m%d'); $report_content .= "=> ./30day_summary_$current_date.gmi 30-Day Summary Report\n\n"; @@ -1145,29 +1209,29 @@ $content # $report_path already defined above say "Writing report to $report_path"; - FileHelper::write( $report_path, $report_content ); - + FileHelper::write($report_path, $report_content); + # Also write HTML version mkdir $html_output_dir unless -d $html_output_dir; - my $html_path = "$html_output_dir/$date.html"; + my $html_path = "$html_output_dir/$date.html"; my $html_content = gemtext_to_html($report_content); - my $html_page = generate_html_page("Stats for $year-$month-$day", $html_content); + my $html_page = generate_html_page("Stats for $year-$month-$day", $html_content); say "Writing HTML report to $html_path"; - FileHelper::write( $html_path, $html_page ); + FileHelper::write($html_path, $html_page); } # Generate summary reports - generate_summary_report( 30, $stats_dir, $output_dir, $html_output_dir, %merged ); - + generate_summary_report(30, $stats_dir, $output_dir, $html_output_dir, %merged); + # Generate index.gmi and index.html - generate_index( $output_dir, $html_output_dir ); + generate_index($output_dir, $html_output_dir); } sub generate_summary_report { - my ( $days, $stats_dir, $output_dir, $html_output_dir, %merged ) = @_; + my ($days, $stats_dir, $output_dir, $html_output_dir, %merged) = @_; # Get the last N days of dates - my @dates = sort { $b cmp $a } keys %merged; + my @dates = sort { $b cmp $a } keys %merged; my $max_index = $days - 1; @dates = @dates[ 0 .. 
@@ -1176,16 +1240,16 @@ $content
         # Build report content
         my $report_content = build_report_header($today, $days);

+        # Order: feed counts -> Top URLs -> daily top 3 for last 30 days -> other tables
-        $report_content .= build_feed_statistics_section( \@dates, \%merged );
+        $report_content .= build_feed_statistics_section(\@dates, \%merged);

         # Aggregate and add top lists
-        my ( $all_hosts, $all_urls ) =
-          aggregate_hosts_and_urls( \@dates, \%merged );
+        my ($all_hosts, $all_urls) = aggregate_hosts_and_urls(\@dates, \%merged);
         $report_content .= build_top_urls_section($all_urls, $days);
         $report_content .= build_top3_urls_last_n_days_per_day($stats_dir, 30, \%merged);
         $report_content .= build_top_hosts_section($all_hosts, $days);
-        $report_content .= build_daily_summary_section( \@dates, \%merged );
+        $report_content .= build_daily_summary_section(\@dates, \%merged);

         # Add links to other summary reports
         $report_content .= build_summary_links($days, $report_date);
@@ -1195,24 +1259,25 @@ $content
         my $report_path = "$output_dir/${days}day_summary_$report_date.gmi";
         say "Writing $days-day summary report to $report_path";
-        FileHelper::write( $report_path, $report_content );
-
+        FileHelper::write($report_path, $report_content);
+
         # Also write HTML version, except for 365-day summaries (HTML suppressed)
         if ($days != 365) {
             mkdir $html_output_dir unless -d $html_output_dir;
-            my $html_path = "$html_output_dir/${days}day_summary_$report_date.html";
+            my $html_path = "$html_output_dir/${days}day_summary_$report_date.html";
             my $html_content = gemtext_to_html($report_content);
-            my $html_page = generate_html_page("$days-Day Summary Report", $html_content);
+            my $html_page = generate_html_page("$days-Day Summary Report", $html_content);
             say "Writing HTML $days-day summary report to $html_path";
-            FileHelper::write( $html_path, $html_page );
-        } else {
+            FileHelper::write($html_path, $html_page);
+        }
+        else {
             say "Skipping HTML generation for 365-day summary (Gemtext only)";
         }
     }

     sub build_report_header {
         my ($today, $days) = @_;
-        $days //= 30;    # Default to 30 days for backward compatibility
+        $days //= 30;    # Default to 30 days for backward compatibility

         my $content = "# $days-Day Summary Report\n\n";
         $content .= "Generated on " . $today->strftime('%Y-%m-%d') . "\n\n";
@@ -1220,74 +1285,65 @@ $content
     }

     sub build_daily_summary_section {
-        my ( $dates, $merged ) = @_;
+        my ($dates, $merged) = @_;

         my $content = "## Daily Summary Evolution (Last 30 Days)\n\n";
         $content .= "### Total Requests by Day\n\n```\n";

         my @summary_rows;
-        for my $date ( reverse @$dates ) {
+        for my $date (reverse @$dates) {
             my $stats = $merged->{$date};
             next unless $stats->{count};
-            push @summary_rows, build_daily_summary_row( $date, $stats );
+            push @summary_rows, build_daily_summary_row($date, $stats);
         }

-        $content .= format_table(
-            [ 'Date', 'Filtered', 'Gemini', 'Web', 'IPv4', 'IPv6', 'Total' ],
-            \@summary_rows );
+        $content .= format_table([ 'Date', 'Filtered', 'Gemini', 'Web', 'IPv4', 'IPv6', 'Total' ], \@summary_rows);
         $content .= "\n```\n\n";

         return $content;
     }

     sub build_daily_summary_row {
-        my ( $date, $stats ) = @_;
+        my ($date, $stats) = @_;

-        my ( $year, $month, $day ) = $date =~ /(\d{4})(\d{2})(\d{2})/;
+        my ($year, $month, $day) = $date =~ /(\d{4})(\d{2})(\d{2})/;
         my $formatted_date = "$year-$month-$day";

         my $total_requests =
-          ( $stats->{count}{gemini} // 0 ) + ( $stats->{count}{web} // 0 );
+            ($stats->{count}{gemini} // 0) + ($stats->{count}{web} // 0);
         my $filtered = $stats->{count}{filtered} // 0;
         my $gemini = $stats->{count}{gemini} // 0;
         my $web = $stats->{count}{web} // 0;
         my $ipv4 = $stats->{count}{IPv4} // 0;
         my $ipv6 = $stats->{count}{IPv6} // 0;

-        return [
-            $formatted_date, $filtered,
-            $gemini, $web, $ipv4,
-            $ipv6, $total_requests
-        ];
+        return [ $formatted_date, $filtered, $gemini, $web, $ipv4, $ipv6, $total_requests ];
     }

     sub build_feed_statistics_section {
-        my ( $dates, $merged ) = @_;
+        my ($dates, $merged) = @_;

         my $content = "### Feed Statistics Evolution\n\n```\n";

         my @feed_rows;
-        for my $date ( reverse @$dates ) {
+        for my $date (reverse @$dates) {
             my $stats = $merged->{$date};
             next unless $stats->{feed_ips};
-            push @feed_rows, build_feed_statistics_row( $date, $stats );
+            push @feed_rows, build_feed_statistics_row($date, $stats);
         }

-        $content .= format_table(
-            [ 'Date', 'Gem Feed', 'Gem Atom', 'Web Feed', 'Web Atom', 'Total' ],
-            \@feed_rows
-        );
+        $content .= format_table([ 'Date', 'Gem Feed', 'Gem Atom', 'Web Feed', 'Web Atom', 'Total' ], \@feed_rows);
         $content .= "\n```\n\n";

         return $content;
     }

     sub build_feed_statistics_row {
-        my ( $date, $stats ) = @_;
+        my ($date, $stats) = @_;

-        my ( $year, $month, $day ) = $date =~ /(\d{4})(\d{2})(\d{2})/;
+        my ($year, $month, $day) = $date =~ /(\d{4})(\d{2})(\d{2})/;
         my $formatted_date = "$year-$month-$day";

         return [
@@ -1301,7 +1357,7 @@ $content
     }

     sub aggregate_hosts_and_urls {
-        my ( $dates, $merged ) = @_;
+        my ($dates, $merged) = @_;

         my %all_hosts;
         my %all_urls;
@@ -1311,20 +1367,19 @@ $content
             next unless $stats->{page_ips};

             # Aggregate hosts
-            while ( my ( $host, $count ) = each %{ $stats->{page_ips}{hosts} } )
-            {
+            while (my ($host, $count) = each %{ $stats->{page_ips}{hosts} }) {
                 $all_hosts{$host} //= 0;
                 $all_hosts{$host} += $count;
             }

             # Aggregate URLs
-            while ( my ( $url, $count ) = each %{ $stats->{page_ips}{urls} } ) {
+            while (my ($url, $count) = each %{ $stats->{page_ips}{urls} }) {
                 $all_urls{$url} //= 0;
                 $all_urls{$url} += $count;
             }
         }

-        return ( \%all_hosts, \%all_urls );
+        return (\%all_hosts, \%all_urls);
     }

     sub build_top_hosts_section {
@@ -1335,14 +1390,14 @@ $content
         my @host_rows;
         my @sorted_hosts =
-          sort { $all_hosts->{$b} <=> $all_hosts->{$a} } keys %$all_hosts;
+            sort { $all_hosts->{$b} <=> $all_hosts->{$a} } keys %$all_hosts;
         @sorted_hosts = @sorted_hosts[ 0 .. 49 ] if @sorted_hosts > 50;

         for my $host (@sorted_hosts) {
             push @host_rows, [ $host, $all_hosts->{$host} ];
         }

-        $content .= format_table( [ 'Host', 'Visitors' ], \@host_rows );
+        $content .= format_table([ 'Host', 'Visitors' ], \@host_rows);
         $content .= "\n```\n\n";

         return $content;
@@ -1356,7 +1411,7 @@ $content
         my @url_rows;
         my @sorted_urls =
-          sort { $all_urls->{$b} <=> $all_urls->{$a} } keys %$all_urls;
+            sort { $all_urls->{$b} <=> $all_urls->{$a} } keys %$all_urls;
         @sorted_urls = @sorted_urls[ 0 .. 49 ] if @sorted_urls > 50;

         for my $url (@sorted_urls) {
@@ -1364,18 +1419,19 @@ $content
         }

         # Truncate URLs to fit within 100-character rows
-        truncate_urls_for_table( \@url_rows, 'Visitors' );
+        truncate_urls_for_table(\@url_rows, 'Visitors');

-        $content .= format_table( [ 'URL', 'Visitors' ], \@url_rows );
+        $content .= format_table([ 'URL', 'Visitors' ], \@url_rows);
         $content .= "\n```\n\n";

         return $content;
     }

     sub build_summary_links {
-        my ( $current_days, $report_date ) = @_;
+        my ($current_days, $report_date) = @_;

         my $content = '';

+        # Only add link to 30-day summary when not on the 30-day report itself
         if ($current_days != 30) {
             $content .= "## Other Summary Reports\n\n";
@@ -1385,17 +1441,18 @@ $content
         return $content;
     }

-sub build_top3_urls_last_n_days_per_day {
-    my ($stats_dir, $days, $merged) = @_;
-    $days //= 30;
-    my $content = "## Top 5 URLs Per Day (Last ${days} Days)\n\n";
+    sub build_top3_urls_last_n_days_per_day {
+        my ($stats_dir, $days, $merged) = @_;
+        $days //= 30;
+        my $content = "## Top 5 URLs Per Day (Last ${days} Days)\n\n";

-    my @all = DateHelper::last_month_dates();
-    my @dates = @all;
-    @dates = @all[0 .. $days-1] if @all > $days;
-    return $content . "(no data)\n\n" unless @dates;
+        my @all = DateHelper::last_month_dates();
+        my @dates = @all;
+        @dates = @all[ 0 .. $days - 1 ] if @all > $days;
+        return $content . "(no data)\n\n" unless @dates;

         for my $date (@dates) {
+            # Prefer in-memory merged stats if available; otherwise merge from disk
             my $stats = $merged->{$date};
             if (!$stats || !($stats->{page_ips} && $stats->{page_ips}{urls})) {
@@ -1403,24 +1460,24 @@ $content
             }
             next unless $stats && $stats->{page_ips} && $stats->{page_ips}{urls};

-            my ($y,$m,$d) = $date =~ /(\d{4})(\d{2})(\d{2})/;
+            my ($y, $m, $d) = $date =~ /(\d{4})(\d{2})(\d{2})/;
             $content .= "### $y-$m-$d\n\n";

-            my $urls = $stats->{page_ips}{urls};
-            my @sorted = sort { ($urls->{$b}//0) <=> ($urls->{$a}//0) } keys %$urls;
+            my $urls = $stats->{page_ips}{urls};
+            my @sorted = sort { ($urls->{$b} // 0) <=> ($urls->{$a} // 0) } keys %$urls;
             next unless @sorted;

             my $limit = @sorted < 5 ? @sorted : 5;
-            @sorted = @sorted[0..$limit-1];
+            @sorted = @sorted[ 0 .. $limit - 1 ];

             my @rows;
             for my $u (@sorted) {
                 push @rows, [ $u, $urls->{$u} // 0 ];
             }
-            truncate_urls_for_table( \@rows, 'Visitors' );
+            truncate_urls_for_table(\@rows, 'Visitors');
             $content .= "```\n" . format_table([ 'URL', 'Visitors' ], \@rows) . "\n```\n\n";
+        }
+
+        return $content;
     }
-    return $content;
-}
-
 sub generate_index {
     my ($output_dir, $html_output_dir) = @_;
@@ -1430,13 +1487,14 @@ sub build_top3_urls_last_n_days_per_day {
     closedir($dh);

     my @summaries_30day = sort { $b cmp $a } grep { /^30day_summary_/ } @gmi_files;
-    my $latest_30 = $summaries_30day[0];
+    my $latest_30 = $summaries_30day[0];

     my $index_path = "$output_dir/index.gmi";
     mkdir $html_output_dir unless -d $html_output_dir;
     my $html_path = "$html_output_dir/index.html";

     if ($latest_30) {
+        # Read 30-day summary content and use it as index
         my $summary_path = "$output_dir/$latest_30";
         open my $sfh, '<', $summary_path or die "$summary_path: $!";
@@ -1457,9 +1515,10 @@ sub build_top3_urls_last_n_days_per_day {
             close $hh;
             say "Writing HTML index to $html_path (copy of $latest_html)";
             FileHelper::write($html_path, $html_page);
-        } else {
+        }
+        else {
             my $html_content = gemtext_to_html($content);
-            my $html_page = generate_html_page("30-Day Summary Report", $html_content);
+            my $html_page = generate_html_page("30-Day Summary Report", $html_content);
             say "Writing HTML index to $html_path (from gemtext)";
             FileHelper::write($html_path, $html_page);
         }
@@ -1472,18 +1531,18 @@ sub build_top3_urls_last_n_days_per_day {
         FileHelper::write($index_path, $fallback);

         my $html_content = gemtext_to_html($fallback);
-        my $html_page = generate_html_page("Foostats Reports Index", $html_content);
+        my $html_page = generate_html_page("Foostats Reports Index", $html_content);
         say "Writing fallback HTML index to $html_path";
         FileHelper::write($html_path, $html_page);
     }
 }

 package main;
-    use Getopt::Long;
-    use Sys::Hostname;
+use Getopt::Long;
+use Sys::Hostname;

-    sub usage {
-        print <<~"USAGE";
+sub usage {
+    print <<~"USAGE";
        Usage: $0 [options]

        Options:
@@ -1506,73 +1565,72 @@ package main;
          --version Show version information.
          --help Show this help message.
        USAGE
-        exit 0;
-    }
+    exit 0;
+}

-    sub parse_logs ( $stats_dir, $odds_file, $odds_log ) {
-        my $out = Foostats::FileOutputter->new( stats_dir => $stats_dir );
+sub parse_logs ($stats_dir, $odds_file, $odds_log) {
+    my $out = Foostats::FileOutputter->new(stats_dir => $stats_dir);

-        $out->{stats} = Foostats::Logreader::parse_logs(
-            $out->last_processed_date('web'),
-            $out->last_processed_date('gemini'),
-            $odds_file, $odds_log
-        );
+    $out->{stats} = Foostats::Logreader::parse_logs(
+        $out->last_processed_date('web'),
+        $out->last_processed_date('gemini'),
+        $odds_file, $odds_log
+    );

-        $out->write;
-    }
+    $out->write;
+}

-    sub foostats_main {
-        my ( $parse_logs, $replicate, $report, $all, $help, $version );
-
-        # With default values
-        my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
-        my $odds_file = $stats_dir . '/fooodds.txt';
-        my $odds_log = '/var/log/fooodds';
-        my $output_dir;    # Will default to $stats_dir/gemtext if not specified
-        my $html_output_dir;    # Will default to /var/www/htdocs/gemtexter/stats.foo.zone if not specified
-        my $partner_node =
-          hostname eq 'fishfinger.buetow.org'
-          ? 'blowfish.buetow.org'
-          : 'fishfinger.buetow.org';
-
-        GetOptions
-          'parse-logs!' => \$parse_logs,
-          'filter-log=s' => \$odds_log,
-          'odds-file=s' => \$odds_file,
-          'replicate!' => \$replicate,
-          'report!' => \$report,
-          'all!' => \$all,
-          'stats-dir=s' => \$stats_dir,
-          'output-dir=s' => \$output_dir,
-          'html-output-dir=s' => \$html_output_dir,
-          'partner-node=s' => \$partner_node,
-          'version' => \$version,
-          'help|?' => \$help;
-
-        if ($version) {
-            print "foostats " . VERSION . "\n";
-            exit 0;
-        }
+sub foostats_main {
+    my ($parse_logs, $replicate, $report, $all, $help, $version);
+
+    # With default values
+    my $stats_dir = '/var/www/htdocs/buetow.org/self/foostats';
+    my $odds_file = $stats_dir . '/fooodds.txt';
+    my $odds_log = '/var/log/fooodds';
+    my $output_dir;    # Will default to $stats_dir/gemtext if not specified
+    my $html_output_dir;    # Will default to /var/www/htdocs/gemtexter/stats.foo.zone if not specified
+    my $partner_node =
+        hostname eq 'fishfinger.buetow.org'
+        ? 'blowfish.buetow.org'
+        : 'fishfinger.buetow.org';
+
+    GetOptions
+        'parse-logs!' => \$parse_logs,
+        'filter-log=s' => \$odds_log,
+        'odds-file=s' => \$odds_file,
+        'replicate!' => \$replicate,
+        'report!' => \$report,
+        'all!' => \$all,
+        'stats-dir=s' => \$stats_dir,
+        'output-dir=s' => \$output_dir,
+        'html-output-dir=s' => \$html_output_dir,
+        'partner-node=s' => \$partner_node,
+        'version' => \$version,
+        'help|?' => \$help;
+
+    if ($version) {
+        print "foostats " . VERSION . "\n";
+        exit 0;
+    }

-        usage() if $help;
+    usage() if $help;

-        parse_logs( $stats_dir, $odds_file, $odds_log )
-          if $parse_logs
-          or $all;
+    parse_logs($stats_dir, $odds_file, $odds_log)
+        if $parse_logs
+        or $all;

-        Foostats::Replicator::replicate( $stats_dir, $partner_node )
-          if $replicate
-          or $all;
+    Foostats::Replicator::replicate($stats_dir, $partner_node)
+        if $replicate
+        or $all;

-        # Set default output directories if not specified
-        $output_dir //= '/var/gemini/stats.foo.zone';
-        $html_output_dir //= '/var/www/htdocs/gemtexter/stats.foo.zone';
+    # Set default output directories if not specified
+    $output_dir //= '/var/gemini/stats.foo.zone';
+    $html_output_dir //= '/var/www/htdocs/gemtexter/stats.foo.zone';

-        Foostats::Reporter::report( $stats_dir, $output_dir, $html_output_dir,
-            Foostats::Merger::merge($stats_dir) )
-          if $report
-          or $all;
-    }
+    Foostats::Reporter::report($stats_dir, $output_dir, $html_output_dir, Foostats::Merger::merge($stats_dir))
+        if $report
+        or $all;
+}

-    # Only run main flow when executed as a script, not when required (e.g., tests)
-    foostats_main() unless caller;
+# Only run main flow when executed as a script, not when required (e.g., tests)
+foostats_main() unless caller;
diff --git a/t/tmp_filter_log b/t/tmp_filter_log
index 3b07060..1b549e8 100644
--- a/t/tmp_filter_log
+++ b/t/tmp_filter_log
@@ -23,3 +23,6 @@ OK: /gemfeed/index.gmi appears fine...
 OK: /index.html appears fine...
 OK: /index.html appears fine...
 WARN: same blocked due to excessive requesting...
+OK: /index.html appears fine...
+OK: /some/really/long/path/with/many/segments/and/query/parts/that/could/be/truncated/when/rendered/for/display/example.html appears fine...
+WARN: kztSAIRs7AaVHRqGWS7so4NFHc3bsTmarHBZvMDO8IreNJYorMG0T4VtDO7g32NwLamuggahn8zDDaQ8zptcug blocked due to excessive requesting...
