#!/usr/bin/perl

use v5.38;

# Those are enabled automatically now w/ this version of Perl
# use strict;
# use warnings;

use builtin qw(true false);
use experimental qw(builtin);

use feature qw(refaliasing);
no warnings qw(experimental::refaliasing);

# Debugging aids like diagnostics are noisy in production.
# Removed per review: enable locally when debugging only.

use constant VERSION => 'v0.2.0';

# Package: FileHelper — small file/JSON helpers
# - Purpose: Atomic writes, gzip JSON read/write, and line reading.
# - Notes: Dies on I/O errors; JSON encoding uses the JSON module.
package FileHelper {
    use JSON;

    # The ':gzip' I/O layer used below is provided by this module.
    use PerlIO::gzip;

    # Sub: write
    # - Purpose: Atomic write to a file via "$path.tmp" and rename.
    # - Params: $path (str) destination; $content (str) contents to write.
    # - Return: true on success; dies on failure.
    sub write ($path, $content) {
        my $tmp = "$path.tmp";

        open my $fh, '>', $tmp or die "\nCannot open file $tmp: $!";

        # Buffered write errors (disk full, quota) only surface on print/close,
        # so both must be checked before the temp file replaces the target.
        print {$fh} $content or die "$tmp: $!";
        close $fh or die "$tmp: $!";

        rename $tmp, $path or die "$tmp: $!";
    }

    # Sub: write_json_gz
    # - Purpose: JSON-encode $data and write it gzipped atomically.
    # - Params: $path (str) destination path; $data (ref/scalar) Perl data.
    # - Return: true on success; dies on failure.
    sub write_json_gz ($path, $data) {
        my $json = encode_json $data;
        my $tmp  = "$path.tmp";

        say "Writing $path";
        open my $fd, '>:gzip', $tmp or die "$tmp: $!";
        print {$fd} $json or die "$tmp: $!";

        # The gzip layer flushes its trailer on close; a failure here means
        # the file is truncated and must not be renamed into place.
        close $fd or die "$tmp: $!";

        rename $tmp, $path or die "$tmp: $!";
    }

    # Sub: read_json_gz
    # - Purpose: Read a gzipped JSON file and decode to Perl data.
    # - Params: $path (str) path to .json.gz file.
    # - Return: Perl data structure.
    sub read_json_gz ($path) {
        say "Reading $path";
        open my $fd, '<:gzip', $path or die "$path: $!";

        # Slurp the whole document: decode_json takes a single scalar, so a
        # plain <$fd> would only hand it the first line of the file.
        my $text = do { local $/; <$fd> };
        close $fd;

        return decode_json $text;
    }

    # Sub: read_lines
    # - Purpose: Slurp file lines and chomp newlines.
    # - Params: $path (str) file path.
    # - Return: list of lines (no trailing newlines).
sub read_lines ($path) {
        my @lines;
        open(my $fh, '<', $path) or die "$path: $!";
        chomp(@lines = <$fh>);
        close($fh);
        return @lines;
    }
}

# Package: DateHelper — date range helpers
# - Purpose: Produce date strings used for report windows.
# - Format: Dates are returned as YYYYMMDD strings.
package DateHelper {
    use Time::Piece;

    # Sub: last_month_dates
    # - Purpose: Return the dates from yesterday back to 31 days ago (inclusive).
    # - Params: none.
    # - Return: list of 31 YYYYMMDD strings, newest first.
    sub last_month_dates () {
        my $now = localtime;

        # Anchor the arithmetic at (roughly) local noon. Stepping back in 24h
        # units from an arbitrary wall-clock time can skip or repeat a calendar
        # day when a DST switch lies in between (e.g. when run shortly after
        # midnight); from noon the +/- 1h shift always stays within the day.
        my $seconds_into_day = $now->hour * 3600 + $now->min * 60 + $now->sec;
        my $noon             = $now - $seconds_into_day + 12 * 60 * 60;

        my @dates;
        for my $days_ago (1 .. 31) {
            my $date = $noon - ($days_ago * 24 * 60 * 60);
            push @dates, $date->strftime('%Y%m%d');
        }

        return @dates;
    }
}

# Package: Foostats::Logreader — parse and normalize logs
# - Purpose: Read web and gemini logs, anonymize IPs, and emit normalized events.
# - Output Event: { proto, host, ip_hash, ip_proto, date, time, uri_path, status }
package Foostats::Logreader {
    use Digest::SHA3 'sha3_512_base64';
    use File::stat;
    use PerlIO::gzip;
    use Time::Piece;
    use String::Util qw(contains startswith endswith);

    # Make log locations configurable (env overrides) to enable testing.

    # Sub: gemini_logs_glob
    # - Purpose: Glob for gemini-related logs; env override for testing.
    # - Return: glob pattern string.
    sub gemini_logs_glob { $ENV{FOOSTATS_GEMINI_LOGS_GLOB} // '/var/log/daemon*' }

    # Sub: web_logs_glob
    # - Purpose: Glob for web access logs; env override for testing.
    # - Return: glob pattern string.
    sub web_logs_glob { $ENV{FOOSTATS_WEB_LOGS_GLOB} // '/var/www/logs/access.log*' }

    # Sub: anonymize_ip
    # - Purpose: Classify IPv4/IPv6 and map IP to a stable SHA3-512 base64 hash.
    # - Params: $ip (str) source IP.
    # - Return: ($hash, $proto) where $proto is 'IPv4' or 'IPv6'.
    sub anonymize_ip ($ip) {
        # Only IPv6 literals contain a colon.
        my $ip_proto = contains($ip, ':') ? 'IPv6' : 'IPv4';
        my $ip_hash  = sha3_512_base64($ip);
        return ($ip_hash, $ip_proto);
    }

    # Sub: read_lines
    # - Purpose: Iterate files matching glob by age; invoke $cb for each line.
# - Params: $glob (str) file glob; $cb (code) callback ($year, @fields).
    # - Return: undef; stops early if callback returns undef for a file.
    sub read_lines ($glob, $cb) {
        # Year of the file's last modification. The log lines handled by the
        # gemini parser carry no year, so the file's mtime supplies it.
        # Receives the already opened handle (File::stat accepts handles).
        my sub year ($file) {
            localtime((stat $file)->mtime)->strftime('%Y');
        }

        # Open plain or gzip-compressed (rotated) log files transparently.
        my sub open_file ($path) {
            my $flag = $path =~ /\.gz$/ ? '<:gzip' : '<';
            open my $fd, $flag, $path or die "$path: $!";
            return $fd;
        }

        # Expand the glob exactly once, so the list that is logged is the
        # list that is processed even if logs rotate in the meantime.
        my @paths = glob $glob;
        say 'File path glob matches: ' . join(' ', @paths);

        my $last = false;

        # -M is the file age in days: newest files are processed first.
      LAST:
        for my $path (sort { -M $a <=> -M $b } @paths) {
            say "Processing $path";
            my $file = open_file($path);
            my $year = year($file);

            while (<$file>) {
                next if contains($_, 'logfile turned over');

                # last == true means: After this file, don't process more
                $last = true unless defined $cb->($year, split / +/);
            }

            say "Closing $path (last:$last)";
            close $file;
            last LAST if $last;
        }
    }

    # Sub: parse_web_logs
    # - Purpose: Parse web log lines into normalized events and pass to callback.
    # - Params: $last_processed_date (YYYYMMDD int) lower bound; $cb (code) event consumer.
    # - Return: undef.
    sub parse_web_logs ($last_processed_date, $cb) {
        # '[10/Oct/2024:13:55:36' => ('20241010', '135536')
        my sub parse_date ($date) {
            my $t = Time::Piece->strptime($date, '[%d/%b/%Y:%H:%M:%S');
            return ($t->strftime('%Y%m%d'), $t->strftime('%H%M%S'));
        }

        my sub parse_web_line (@line) {
            my ($date, $time) = parse_date($line[4]);

            # Deliberately 'return undef' (not a bare return): the result is
            # passed straight on as the single argument of $cb, and an undef
            # event is what tells read_lines to stop after the current file.
            return undef if $date < $last_processed_date;

            # X-Forwarded-For?
            my $ip = $line[-2] eq '-' ? $line[1] : $line[-2];
            my ($ip_hash, $ip_proto) = anonymize_ip($ip);

            return {
                proto    => 'web',
                host     => $line[0],
                ip_hash  => $ip_hash,
                ip_proto => $ip_proto,
                date     => $date,
                time     => $time,
                uri_path => $line[7],
                status   => $line[9],
            };
        }

        read_lines(
            web_logs_glob(),
            sub ($year, @line) {
                $cb->(parse_web_line(@line));
            }
        );
    }

    # Sub: parse_gemini_logs
    # - Purpose: Parse vger/relayd lines, merge paired entries, and emit events.
    # - Params: $last_processed_date (YYYYMMDD int); $cb (code) event consumer.
    # - Return: undef.
sub parse_gemini_logs ($last_processed_date, $cb) {
        # The first two fields ('Mon DD') carry no year, so the caller
        # supplies it. Returns a YYYYMMDD string.
        my sub parse_date ($year, @line) {
            my $timestr = "$line[0] $line[1]";
            return Time::Piece->strptime($timestr, '%b %d')->strftime("$year%m%d");
        }

        # vger line: field 5 holds the quoted request URL, field 6 the status.
        my sub parse_vger_line ($year, @line) {
            my $full_path = $line[5] // '';
            $full_path =~ s/"//g;

            # 'proto://host/path' => ('proto:', '', 'host', 'path')
            my ($proto, undef, $host, $uri_path) = split '/', $full_path, 4;
            $uri_path = '' unless defined $uri_path;

            return {
                proto    => 'gemini',
                host     => $host,
                uri_path => "/$uri_path",
                status   => $line[6],
                date     => int(parse_date($year, @line)),
                time     => $line[2],
            };
        }

        # relayd line: field 12 is the value passed to anonymize_ip.
        my sub parse_relayd_line ($year, @line) {
            my $date = int(parse_date($year, @line));
            my ($ip_hash, $ip_proto) = anonymize_ip($line[12]);

            return {
                ip_hash  => $ip_hash,
                ip_proto => $ip_proto,
                date     => $date,
                time     => $line[2],
            };
        }

        # Expect one vger and one relayd log line per event! So collect
        # both events (one from one log line each) and then merge the result hash!
        my ($vger, $relayd);

        read_lines(
            gemini_logs_glob(),
            sub ($year, @line) {
                # The log file is shared with other daemons; their lines may be
                # shorter than expected, so default the inspected fields to ''
                # instead of comparing undef values.
                my ($tag, $kind, $name) = map { $_ // '' } @line[ 4 .. 6 ];

                if ($tag eq 'vger:') {
                    $vger = parse_vger_line($year, @line);
                }
                elsif ($kind eq 'relay' and startswith($name, 'gemini')) {
                    $relayd = parse_relayd_line($year, @line);

                    # undef tells read_lines to stop after the current file.
                    return undef if $relayd->{date} < $last_processed_date;
                }

                # Both halves seen for the same second: emit the merged event.
                if (defined $vger and defined $relayd and $vger->{time} eq $relayd->{time}) {
                    $cb->({ %$vger, %$relayd });
                    $vger = $relayd = undef;
                }

                true;
            }
        );
    }

    # Sub: parse_logs
    # - Purpose: Coordinate parsing for both web and gemini, aggregating into stats.
    # - Params: $last_web_date, $last_gemini_date (YYYYMMDD int), $odds_file, $odds_log.
    # - Return: stats hashref keyed by "proto_YYYYMMDD".
sub parse_logs ($last_web_date, $last_gemini_date, $odds_file, $odds_log) {
        my $aggregator = Foostats::Aggregator->new($odds_file, $odds_log);

        say "Last web date: $last_web_date";
        say "Last gemini date: $last_gemini_date";

        # Both parsers feed the very same aggregator; the callback hands the
        # aggregator's return value back to the parser.
        my $collect = sub ($event) { $aggregator->add($event) };

        parse_web_logs($last_web_date, $collect);
        parse_gemini_logs($last_gemini_date, $collect);

        return $aggregator->{stats};
    }
}

# Package: Foostats::Filter — request filtering and logging
# - Purpose: Identify odd URI patterns and excessive requests per second per IP.
# - Notes: Maintains an in-process blocklist for the current run.
package Foostats::Filter {
    use String::Util qw(contains startswith endswith);

    # Sub: new
    # - Purpose: Build a filter from a pattern file and the path of its log.
    # - Params: $odds_file (str) pattern list; $log_path (str) append-only log file.
    # - Return: blessed Foostats::Filter instance.
    sub new ($class, $odds_file, $log_path) {
        say "Logging filter to $log_path";

        my %self = (
            odds     => [ FileHelper::read_lines($odds_file) ],
            log_path => $log_path,
        );

        return bless \%self, $class;
    }

    # Sub: ok
    # - Purpose: Decide whether an event may be counted; remembers offenders.
    # - Params: $event (hashref) normalized request.
    # - Return: true if allowed; false if blocked.
    sub ok ($self, $event) {
        # Blocklist of IP hashes, kept for the lifetime of the process.
        state %blocked = ();

        my $ip_hash = $event->{ip_hash};

        # Once blocked, always blocked (no further checks, no further logging).
        return false if exists $blocked{$ip_hash};

        return true unless $self->odd($event) or $self->excessive($event);

        $blocked{$ip_hash}++;
        return false;
    }

    # Sub: odd
    # - Purpose: Match URI path against user-provided odd patterns (substring match).
    # - Params: $event (hashref) with uri_path.
    # - Return: true if odd (blocked), false otherwise.
sub odd ($self, $event) {
        \my $uri_path = \$event->{uri_path};

        for ($self->{odds}->@*) {
            # Skip blank lines and '#' comments of the pattern file.
            next if !defined $_ || $_ eq '' || /^\s*#/;
            next unless contains($uri_path, $_);

            $self->log('WARN', $uri_path, "contains $_ and is odd and will therefore be blocked!");
            return true;
        }

        $self->log('OK', $uri_path, "appears fine...");
        return false;
    }

    # Sub: log
    # - Purpose: Deduplicated append-only logging for filter decisions.
    # - Params: $severity (OK|WARN), $subject (str), $message (str).
    # - Return: undef.
    sub log ($self, $severity, $subject, $message) {
        state %dedup;

        # Don't log if path was already logged
        return if exists $dedup{$subject};
        $dedup{$subject} = 1;

        open(my $fh, '>>', $self->{log_path}) or die $self->{log_path} . ": $!";
        print $fh "$severity: $subject $message\n";
        close($fh);
    }

    # Sub: excessive
    # - Purpose: Block if an IP makes more than one request within the same second.
    # - Params: $event (hashref) with time and ip_hash.
    # - Return: true if blocked; false otherwise.
    sub excessive ($self, $event) {
        \my $time    = \$event->{time};
        \my $ip_hash = \$event->{ip_hash};

        state $last_time = $time;    # Timestamp with second resolution
        state %count     = ();       # IPs accessing within the same second!

        # A new second has started: forget the counters of the previous one.
        # The current request is still counted below. Returning early here
        # would leave the first request of every second uncounted, so the
        # second request of that IP in the same second would slip through.
        if ($last_time ne $time) {
            $last_time = $time;
            %count     = ();
        }

        # IP requested site more than once within the same second!?
        if (1 < ++$count{$ip_hash}) {
            $self->log('WARN', $ip_hash, "blocked due to excessive requesting...");
            return true;
        }

        return false;
    }
}

# Package: Foostats::Aggregator — in-memory stats builder
# - Purpose: Apply filters and accumulate counts, unique IPs per feed/page.
package Foostats::Aggregator {
    use String::Util qw(contains startswith endswith);

    use constant {
        ATOM_FEED_URI => '/gemfeed/atom.xml',
        GEMFEED_URI   => '/gemfeed/index.gmi',
        GEMFEED_URI_2 => '/gemfeed/',
    };

    # Sub: new
    # - Purpose: Construct aggregator with a filter and empty stats store.
    # - Params: $odds_file (str), $odds_log (str).
    # - Return: Foostats::Aggregator instance.
sub new ($class, $odds_file, $odds_log) {
        my $filter = Foostats::Filter->new($odds_file, $odds_log);
        return bless { filter => $filter, stats => {} }, $class;
    }

    # Sub: add
    # - Purpose: Run the event through the filter and record it in its
    #            per-protocol, per-day bucket.
    # - Params: $event (hashref) normalized event; ignored if undef.
    # - Return: $event (undef if none was given); a filtered event only bumps
    #           the bucket's filtered counter.
    sub add ($self, $event) {
        return undef unless defined $event;

        my $date_key = join '_', $event->{proto}, $event->{date};

        # Stats data model per protocol+day (key: "proto_YYYYMMDD"):
        # - count: per-proto request count, per IP version, and filtered count
        # - feed_ips: unique IPs per feed type (atom_feed, gemfeed)
        # - page_ips: unique IPs per host and per URL
        my $bucket = $self->{stats}{$date_key} //= {
            count    => { filtered  => 0 },
            feed_ips => { atom_feed => {}, gemfeed => {} },
            page_ips => { hosts     => {}, urls    => {} },
        };

        if ($self->{filter}->ok($event)) {
            $self->add_count($bucket, $event);

            # A feed hit is not additionally tracked as a page hit.
            $self->add_page_ips($bucket, $event)
                unless $self->add_feed_ips($bucket, $event);
        }
        else {
            $bucket->{count}{filtered}++;
        }

        return $event;
    }

    # Sub: add_count
    # - Purpose: Bump the request counters for the event's protocol and IP version.
    # - Params: $stats (hashref) date bucket; $event (hashref).
    # - Return: not meaningful.
    sub add_count ($self, $stats, $event) {
        $stats->{count}{ $event->{proto} }++;
        $stats->{count}{ $event->{ip_proto} }++;
    }

    # Sub: add_feed_ips
    # - Purpose: If event hits feed endpoints, add unique IP and short-circuit.
    # - Params: $stats (hashref), $event (hashref).
    # - Return: 1 if feed matched; 0 otherwise.
sub add_feed_ips ($self, $stats, $event) {
        \my $f = \$stats->{feed_ips};
        \my $e = \$event;

        # Atom feed (exact path match, allow optional query string)
        if ($e->{uri_path} =~ m{^/gemfeed/atom\.xml(?:[?#].*)?$}) {
            ($f->{atom_feed}->{ $e->{ip_hash} } //= 0)++;
            return 1;
        }

        # Gemfeed index: '/gemfeed/' or '/gemfeed/index.gmi' (optionally with query)
        if ($e->{uri_path} =~ m{^/gemfeed/(?:index\.gmi)?(?:[?#].*)?$}) {
            ($f->{gemfeed}->{ $e->{ip_hash} } //= 0)++;
            return 1;
        }

        return 0;
    }

    # Sub: add_page_ips
    # - Purpose: Track unique IPs per host and per URL for .html/.gmi pages.
    # - Params: $stats (hashref), $event (hashref).
    # - Return: undef.
    sub add_page_ips ($self, $stats, $event) {
        \my $e = \$event;
        \my $p = \$stats->{page_ips};

        return if !endswith($e->{uri_path}, '.html') && !endswith($e->{uri_path}, '.gmi');

        ($p->{hosts}->{ $e->{host} }->{ $e->{ip_hash} } //= 0)++;
        ($p->{urls}->{ $e->{host} . $e->{uri_path} }->{ $e->{ip_hash} } //= 0)++;
    }
}

# Package: Foostats::FileOutputter — write per-day stats to disk
# - Purpose: Persist aggregated stats to gzipped JSON files under a stats dir.
package Foostats::FileOutputter {
    use JSON;
    use Sys::Hostname;
    use PerlIO::gzip;

    # Sub: new
    # - Purpose: Create outputter with stats_dir; ensures directory exists.
    # - Params: %args (hash) must include stats_dir.
    # - Return: Foostats::FileOutputter instance.
    sub new ($class, %args) {
        my $self = bless \%args, $class;
        mkdir $self->{stats_dir} or die $self->{stats_dir} . ": $!" unless -d $self->{stats_dir};
        return $self;
    }

    # Sub: last_processed_date
    # - Purpose: Determine the most recent processed date for a protocol for this host.
    # - Params: $proto (str) 'web' or 'gemini'.
    # - Return: YYYYMMDD int (0 if none found).
    sub last_processed_date ($self, $proto) {
        my $hostname  = hostname();
        my @processed = glob $self->{stats_dir} . "/${proto}_????????.$hostname.json.gz";
        return 0 unless @processed;

        # glob returns its matches sorted, so the last one carries the newest
        # date. The hostname is quoted because its dots are regex
        # metacharacters; a file whose 8 wildcard characters are not all
        # digits simply yields 0.
        my ($date) = $processed[-1] =~ /_(\d{8})\.\Q$hostname\E\.json\.gz\z/;
        return int($date // 0);
    }

    # Sub: write
    # - Purpose: Write one gzipped JSON file per date bucket to stats_dir.
    # - Params: none (uses $self->{stats}).
    # - Return: undef.
    sub write ($self) {
        $self->for_dates(
            sub ($self, $date_key, $stats) {
                my $hostname = hostname();
                my $path     = $self->{stats_dir} . "/${date_key}.$hostname.json.gz";
                FileHelper::write_json_gz($path, $stats);
            }
        );
    }

    # Sub: for_dates
    # - Purpose: Iterate date-keyed stats in sorted order and call $cb.
    # - Params: $cb (code) receives ($self, $date_key, $stats).
    # - Return: undef.
    sub for_dates ($self, $cb) {
        $cb->($self, $_, $self->{stats}{$_}) for sort keys $self->{stats}->%*;
    }
}

# Package: Foostats::Replicator — pull partner stats files over HTTP(S)
# - Purpose: Fetch recent partner node stats into local stats dir.
package Foostats::Replicator {
    use JSON;
    use File::Basename;
    use LWP::UserAgent;
    use String::Util qw(endswith);

    # Sub: replicate
    # - Purpose: For each proto and last 31 days, replicate newest files.
    # - Params: $stats_dir (str) local dir; $partner_node (str) hostname.
    # - Return: undef (best-effort fetches).
    sub replicate ($stats_dir, $partner_node) {
        say "Replicating from $partner_node";

        for my $proto (qw(gemini web)) {
            my $count = 0;

            for my $date (DateHelper::last_month_dates()) {
                my $file_base = "${proto}_${date}";
                my $dest_path = "${file_base}.$partner_node.json.gz";

                replicate_file(
                    "https://$partner_node/foostats/$dest_path",
                    "$stats_dir/$dest_path",
                    $count++ < 3,    # Always replicate the newest 3 files.
                );
            }
        }
    }

    # Sub: replicate_file
    # - Purpose: Download a single URL to a destination unless already present (unless forced).
    # - Params: $remote_url (str) source; $dest_path (str) destination; $force (bool/int).
    # - Return: undef; logs failures.
    sub replicate_file ($remote_url, $dest_path, $force) {
        # $dest_path already exists, not replicating it
        return if !$force && -f $dest_path;

        say "Replicating $remote_url to $dest_path (force:$force)... ";
        my $response = LWP::UserAgent->new->get($remote_url);

        unless ($response->is_success) {
            say "\nFailed to fetch the file: " . $response->status_line;
            return;
        }

        # Store the raw bytes. decoded_content would undo a Content-Encoding
        # (and charset-decode text types), which must not happen to a payload
        # that is kept on disk as a gzip file.
        FileHelper::write($dest_path, $response->content);
        say 'done';
    }
}

# Package: Foostats::Merger — merge per-host daily stats into a single view
# - Purpose: Merge multiple node files per day into totals and unique counts.
package Foostats::Merger {

    # Sub: merge
    # - Purpose: Produce merged stats for the last month (date => stats hashref).
    # - Params: $stats_dir (str) directory with daily gz JSON files.
    # - Return: hash (not ref) of date => merged stats.
    sub merge ($stats_dir) {
        my %merge;
        $merge{$_} = merge_for_date($stats_dir, $_) for DateHelper::last_month_dates();
        return %merge;
    }

    # Sub: merge_for_date
    # - Purpose: Merge all node files for a specific date into one stats hashref.
    # - Params: $stats_dir (str), $date (YYYYMMDD str/int).
    # - Return: { feed_ips => {...}, count => {...}, page_ips => {...} }.
    sub merge_for_date ($stats_dir, $date) {
        printf "Merging for date %s\n", $date;
        my @stats = stats_for_date($stats_dir, $date);

        return {
            feed_ips => feed_ips(@stats),
            count    => count(@stats),
            page_ips => page_ips(@stats),
        };
    }

    # Sub: merge_ips
    # - Purpose: Deep-ish merge helper: sums numbers, merges hash-of-hash counts.
    # - Params: $a (hashref target), $b (hashref source), $key_transform (code|undef).
    # - Return: undef; updates $a in place; dies on incompatible types.
sub merge_ips ($a, $b, $key_transform = undef) {
        # Add the counters of hash $b onto hash $a.
        my sub merge ($a, $b) {
            while (my ($key, $val) = each %$b) {
                $a->{$key} //= 0;
                $a->{$key} += $val;
            }
        }

        my $is_num = qr/^\d+(\.\d+)?$/;

        while (my ($key, $val) = each %$b) {
            $key = $key_transform->($key) if defined $key_transform;

            if (not exists $a->{$key}) {
                # Store a copy of a nested hash: keeping the source's own
                # reference would make later merges modify the caller's data.
                $a->{$key} = ref($val) eq 'HASH' ? {%$val} : $val;
            }
            elsif (ref($a->{$key}) eq 'HASH' && ref($val) eq 'HASH') {
                merge($a->{$key}, $val);
            }
            elsif ($a->{$key} =~ $is_num && $val =~ $is_num) {
                $a->{$key} += $val;
            }
            else {
                # die concatenates its arguments, it does not format them,
                # hence the explicit sprintf.
                die sprintf(
                    "Not merging key '%s' (ref:%s): '%s' (ref:%s) with '%s' (ref:%s)\n",
                    $key, ref($key), $a->{$key}, ref($a->{$key}), $val, ref($val)
                );
            }
        }
    }

    # Sub: feed_ips
    # - Purpose: Merge feed unique-IP sets from per-proto stats into totals.
    # - Params: @stats (list of stats hashrefs) each with {proto, feed_ips}.
    # - Return: hashref with Total and per-proto feed counts.
    sub feed_ips (@stats) {
        my (%gemini, %web);

        for my $stats (@stats) {
            my $merge = $stats->{proto} eq 'web' ? \%web : \%gemini;
            printf "Merging proto %s feed IPs\n", $stats->{proto};
            merge_ips($merge, $stats->{feed_ips});
        }

        # Union of all IPs that fetched any feed over any protocol.
        my %total;
        merge_ips(\%total, $web{$_})    for keys %web;
        merge_ips(\%total, $gemini{$_}) for keys %gemini;

        # A feed type without any stats for the day counts as zero IPs.
        my $unique = sub ($ips) { scalar keys %{ $ips // {} } };

        my %merge = (
            'Total'          => scalar keys %total,
            'Gemini Gemfeed' => $unique->($gemini{gemfeed}),
            'Gemini Atom'    => $unique->($gemini{atom_feed}),
            'Web Gemfeed'    => $unique->($web{gemfeed}),
            'Web Atom'       => $unique->($web{atom_feed}),
        );

        return \%merge;
    }

    # Sub: count
    # - Purpose: Sum request counters across stats for the day.
    # - Params: @stats (list of stats hashrefs) each with {count}.
    # - Return: hashref of summed counters.
    sub count (@stats) {
        my %merge;

        for my $stats (@stats) {
            while (my ($key, $val) = each $stats->{count}->%*) {
                $merge{$key} //= 0;
                $merge{$key} += $val;
            }
        }

        return \%merge;
    }

    # Sub: page_ips
    # - Purpose: Merge unique IPs per host and per URL; coalesce truncated endings.
    # - Params: @stats (list of stats hashrefs) with {page_ips}{urls,hosts}.
# - Return: hashref with urls/hosts each mapping => unique counts.
    sub page_ips (@stats) {
        # Gemtext pages are counted under the name of their HTML twin.
        my $as_html = sub ($name) {
            $name =~ s/\.gmi$/\.html/;
            return $name;
        };

        my %merged;

        for my $kind (qw(urls hosts)) {
            my $bucket = $merged{$kind} = {};

            for my $stats (@stats) {
                merge_ips($bucket, $stats->{page_ips}->{$kind}, $as_html);
            }

            # Keep only uniq IP count
            for my $name (keys %$bucket) {
                $bucket->{$name} = scalar keys %{ $bucket->{$name} };
            }
        }

        return \%merged;
    }

    # Sub: stats_for_date
    # - Purpose: Read every stats file of the given date (all hosts, both
    #            protos) and tag each with its proto and file path.
    # - Params: $stats_dir (str), $date (YYYYMMDD).
    # - Return: list of stats hashrefs.
    sub stats_for_date ($stats_dir, $date) {
        my @stats;

        for my $proto (qw(gemini web)) {
            my @paths = glob "$stats_dir/${proto}_${date}.*.json.gz";

            for my $path (@paths) {
                printf "Reading %s\n", $path;

                my $loaded = FileHelper::read_json_gz($path);
                $loaded->{proto} = $proto;
                $loaded->{path}  = $path;

                push @stats, $loaded;
            }
        }

        return @stats;
    }
}

# Package: Foostats::Reporter — build gemtext/HTML daily and summary reports
# - Purpose: Render daily reports and rolling summaries (30/365), and index pages.
package Foostats::Reporter {
    use Time::Piece;
    use HTML::Entities qw(encode_entities);

    # Queue of { truncated => ..., original => ... } pairs recorded while
    # URLs are shortened for tables.
    our @TRUNCATED_URL_MAPPINGS;

    # Forget all recorded truncated => original pairs.
    sub reset_truncated_url_mappings {
        @TRUNCATED_URL_MAPPINGS = ();
    }

    # Remember which original URL a truncated form stands for.
    sub _record_truncated_url_mapping {
        my ($truncated, $original) = @_;
        my %pair = (truncated => $truncated, original => $original);
        push @TRUNCATED_URL_MAPPINGS, \%pair;
    }

    # Return the original URL of the first recorded pair whose truncated form
    # equals $candidate and remove that pair; undef if there is none.
    sub _lookup_full_url_for {
        my ($candidate) = @_;

        my $found;
        my $pos = 0;
        while ($pos < @TRUNCATED_URL_MAPPINGS) {
            if ($TRUNCATED_URL_MAPPINGS[$pos]{truncated} eq $candidate) {
                $found = $pos;
                last;
            }
            $pos++;
        }

        return undef unless defined $found;

        my ($pair) = splice @TRUNCATED_URL_MAPPINGS, $found, 1;
        return $pair->{original};
    }

    # Sub: truncate_url
    # - Purpose: Middle-ellipsize long URLs to fit within a target length.
    # - Params: $url (str), $max_length (int default 100).
    # - Return: possibly truncated string.
sub truncate_url {
        my ($url, $max_length) = @_;
        $max_length //= 100;    # Default to 100 characters
        $max_length = 0 if $max_length < 0;

        return $url if length($url) <= $max_length;

        # Calculate how many characters we need to remove
        my $ellipsis         = '...';
        my $ellipsis_length  = length($ellipsis);
        my $available_length = $max_length - $ellipsis_length;

        # No room for anything but (part of) the ellipsis: hard-cut instead.
        # Without this guard the tail below would be substr($url, -0), which
        # is the whole URL, making the result longer than the input.
        return substr($url, 0, $max_length) if $available_length <= 0;

        # Split available length between start and end, favoring the end
        my $keep_start = int($available_length * 0.4);       # 40% for start
        my $keep_end   = $available_length - $keep_start;    # 60% for end

        my $start = substr($url, 0, $keep_start);
        my $end   = substr($url, -$keep_end);

        return $start . $ellipsis . $end;
    }

    # Sub: truncate_urls_for_table
    # - Purpose: Truncate URL cells in-place to fit target table width.
    # - Params: $url_rows (arrayref of [url,count]), $count_column_header (str).
    # - Return: undef; mutates $url_rows.
    sub truncate_urls_for_table {
        my ($url_rows, $count_column_header) = @_;

        # Calculate the maximum width needed for the count column
        my $max_count_width = length($count_column_header // '');
        for my $row (@$url_rows) {
            my $count_width = length($row->[1] // '');
            $max_count_width = $count_width if $count_width > $max_count_width;
        }

        # Row format: "| URL... | count |" with padding
        # Calculate: "| " (2) + URL + " | " (3) + count_with_padding + " |" (2)
        my $max_url_length = 100 - 7 - $max_count_width;
        $max_url_length = 70 if $max_url_length > 70;    # Cap at reasonable length

        # Truncate URLs in place
        for my $row (@$url_rows) {
            my $original  = $row->[0];
            my $truncated = truncate_url($original, $max_url_length);

            # Remember the full URL so that it can be looked up again later.
            if ($truncated ne $original) {
                _record_truncated_url_mapping($truncated, $original);
            }
            $row->[0] = $truncated;
        }
    }

    # Sub: format_table
    # - Purpose: Render a simple monospace table from headers and rows.
    # - Params: $headers (arrayref), $rows (arrayref of arrayrefs).
    # - Return: string with lines separated by \n.
    # - Notes: Missing/undef cells render as empty cells, and the table is as
    #          wide as its widest row, so every line has the same length.
    sub format_table {
        my ($headers, $rows) = @_;

        # Index of the last column: the widest of the header and all rows.
        my $last_col = $#{$headers};
        for my $row (@$rows) {
            $last_col = $#{$row} if $#{$row} > $last_col;
        }

        # Width of each column: its longest cell, header included.
        my @widths;
        for my $col (0 .. $last_col) {
            my $max_width = length($headers->[$col] // '');
            for my $row (@$rows) {
                my $len = length($row->[$col] // '');
                $max_width = $len if $len > $max_width;
            }
            push @widths, $max_width;
        }

        # Render one line of left-aligned, padded cells.
        my $format_line = sub {
            my ($cells) = @_;
            my $line = '|';
            for my $col (0 .. $last_col) {
                $line .= sprintf(" %-*s |", $widths[$col], $cells->[$col] // '');
            }
            return $line;
        };

        my $separator_line = '|';
        for my $width (@widths) {
            $separator_line .= '-' x ($width + 2) . '|';
        }

        my @table_lines;
        push @table_lines, $separator_line;    # Add top terminator
        push @table_lines, $format_line->($headers);
        push @table_lines, $separator_line;
        push @table_lines, $format_line->($_) for @$rows;
        push @table_lines, $separator_line;    # Add bottom terminator

        return join("\n", @table_lines);
    }

    # Convert gemtext to HTML
    # Sub: gemtext_to_html
    # - Purpose: Convert a subset of Gemtext to compact HTML, incl. code blocks and lists.
    # - Params: $content (str) Gemtext.
    # - Return: HTML string (fragment).
sub gemtext_to_html { my ($content) = @_; my $html = ""; my @lines = split /\n/, $content; my $i = 0; while ($i < @lines) { my $line = $lines[$i]; if ($line =~ /^```/) { my @block_lines; $i++; # Move past the opening ``` while ($i < @lines && $lines[$i] !~ /^```/) { push @block_lines, $lines[$i]; $i++; } $html .= _gemtext_to_html_code_block(\@block_lines); } elsif ($line =~ /^### /) { $html .= _gemtext_to_html_heading($line); } elsif ($line =~ /^## /) { $html .= _gemtext_to_html_heading($line); } elsif ($line =~ /^# /) { $html .= _gemtext_to_html_heading($line); } elsif ($line =~ /^=> /) { $html .= _gemtext_to_html_link($line); } elsif ($line =~ /^\* /) { my @list_items; while ($i < @lines && $lines[$i] =~ /^\* /) { push @list_items, $lines[$i]; $i++; } $html .= _gemtext_to_html_list(\@list_items); $i--; # Decrement to re-evaluate the current line in the outer loop } elsif ($line !~ /^\s*$/) { $html .= _gemtext_to_html_paragraph($line); } # Else, it's a blank line, which we skip for compact output. $i++; } return $html; } sub _gemtext_to_html_code_block { my ($lines) = @_; if (is_ascii_table($lines)) { return convert_ascii_table_to_html($lines); } else { my $html = "
\n";
for my $code_line (@$lines) {
$html .= encode_entities($code_line) . "\n";
}
$html .= "\n";
return $html;
}
}
sub _gemtext_to_html_heading {
my ($line) = @_;
if ($line =~ /^### (.*)/) {
return "" . encode_entities($text) . "
\n"; } return ''; } sub _gemtext_to_html_list { my ($lines) = @_; my $html = "" . linkify_text($line) . "
\n"; } # Check if the lines form an ASCII table # Sub: is_ascii_table # - Purpose: Heuristically detect if a code block is an ASCII table. # - Params: $lines (arrayref of strings). # - Return: 1 if likely table; 0 otherwise. sub is_ascii_table { my ($lines) = @_; return 0 if @$lines < 3; # Need at least header, separator, and one data row # Check for separator lines with dashes and pipes for my $line (@$lines) { return 1 if $line =~ /^\|?[\s\-]+\|/; } return 0; } # Convert ASCII table to HTML table # Sub: convert_ascii_table_to_html # - Purpose: Convert simple ASCII table lines to an HTML