From f3a6a7f119c16d07e6cdb233fc9fe9bff9ad1f8c Mon Sep 17 00:00:00 2001 From: Matt Rude Date: Sat, 25 Jun 2011 22:43:51 -0500 Subject: Adding NTP Clients status --- scripts/ntp_clients_stats | 601 ++++++++++++++++++++++++++++++++++++++++++++++ scripts/ntpclientsd | 477 ++++++++++++++++++++++++++++++++++++ 2 files changed, 1078 insertions(+) create mode 100755 scripts/ntp_clients_stats create mode 100755 scripts/ntpclientsd (limited to 'scripts') diff --git a/scripts/ntp_clients_stats b/scripts/ntp_clients_stats new file mode 100755 index 0000000..1fe3b77 --- /dev/null +++ b/scripts/ntp_clients_stats @@ -0,0 +1,601 @@ +#!/usr/bin/perl -w + +# This script is public domain, there is no copyright on it. +# - Wayne Schlitt + +use strict; + +use Getopt::Long; +use POSIX qw(strftime); +use Fcntl ':flock'; + +my $HELP = 0; +my $STARTFILE = "/var/log/ntpstats/ntp_stats.dump"; + +my $result = GetOptions('help' => \$HELP, + 'startfile=s' => \$STARTFILE, + ); + +if ($HELP || !$result) { + print "Usage: ntp_clients_stats [options]\n"; + print "\n"; + print " -help Help on the options.\n"; + print "\n"; + print " -startfile=/path/dumpfile File to read initial state\n"; + + exit(0); +} + +# pre-allocate the hashes, just to make things a little faster + +my (%count, %rate, %first_tstamp, %last_tstamp, %last_printed, %is_active, %is_abusive); + +my $dump_magic = "ntp_stats"; +my $dump_version = 1; + +my $dump_created = 0; +my $dump_written = 0; +my $dump_age = 0; + +my $total_count = 0; +my $active_count = 0; +my $cur_rate = 0; +my $est_cur_rate = 0; +my $lterm_rate = 0; +my $est_lterm_rate = 0; +my $cur_freq = 0; +my $not_counted = 0; +my $num_clients = 0; +my $num_active = 0; +my $num_abusive = 0; +my $last_cleaned = 0; +my $tdiff; + +my ($ip, $key, $value, $sum, $rank, $max_dist); +my $stars = "*" x 80; + + +# +# This sub tries to determine if the client is active or not. Since +# NTP is a stateless and connectionless (UDP based) protocol, we can +# never know for sure. +# +# Any client that has polled recently is assumed to be active, +# although this problably overcounts the active clients because many +# clients, such as those from ntpdate, only give a short burst of +# traffic. +# +# A client that polls once per 2^14 sec (they exist) can have a very low +# count. If they use iburst, they could have a count of only 4 or so. +# if they don't use iburst, they could have a count of 1. Hence, any +# connection with a small count *may* be active. +# +# By testing this function against live data, I can see that it both +# sometimes counts clients as inactive when they are active, and counts +# clients as still being active for quite a while after they become inactive. +# Overall, it seems to do a pretty good job and the overcounts/undercounts +# are not too common and pretty much cancel each other out. I suspect that +# there is a slight bias toward overcounting. +# +sub calc_is_active { + my $ip = $_[0]; + my $age = $dump_written - $last_tstamp{$ip}; + + if ( ($count{$ip} > 5 || $rate{$ip} > 256) && $age / 5. < $rate{$ip} ) { + # client has poll enough to show a pattern and has miss fewer + # than 4 poll intervals. (Or, is rapidly increasing the poll) + return 1; + } elsif ( $count{$ip} > 1 && $rate{$ip} < 4 && $age < 1024 + 60) { + # client may well have used iburst, which will give misleading initial + # rate values, but it isn't old enough to toss yet. + return 1; + } elsif ( $count{$ip} > 1 && $rate{$ip} > 64 && $age < 300 ) { + # I'm not sure what kind of clients these are, but they show up. + return 1; + } elsif ( $age < 60) { + # client just polled, assume it is active. + return 1; + } else { + # assume client has stopped polling. + return 0; + } +} + +# quickly read in the dump data + +my ($magic, $ver); +my $t0 = time(); +while ( 1 ) { + open(DUMP, "<", $STARTFILE ) or die "Could not open startfile: $STARTFILE"; + flock(DUMP,LOCK_SH); + + ($magic, $ver, $total_count, $dump_created, $dump_written, $cur_rate, $lterm_rate) = split(' ', ); + + # make sure we don't process a half-written dump (does this still happen??) + if ( !defined($magic) || $magic eq "" || !defined($dump_written) ) { + flock(DUMP,LOCK_UN); + close(DUMP); + select(undef, undef, undef, .5); + next; + } + + die "Timed out trying to get valid dump data from $STARTFILE" if ( time() - $t0 >= 15 ); + die "$STARTFILE is not a dump file created by ntp_clients" if ( !defined( $magic ) || $magic ne $dump_magic ); + die "Incorrect dump file version: $ver" if ( !defined( $ver ) || ($ver ne "1" && $ver ne "2") ); + + $tdiff = time() - $dump_written; + last if ( ($tdiff > 0 && $tdiff < 59) + || $tdiff < -1 || $tdiff > 70 ); + flock(DUMP,LOCK_UN); + close(DUMP); + + if ( $tdiff >= 60 ) { + select(undef, undef, undef, .5); + } else { + select(undef, undef, undef, 2 - $tdiff); + } +} + +my @dump = ; +flock(DUMP,LOCK_UN); +close(DUMP); + + +# process the data + +$cur_rate = -1 if ($ver eq "1"); +$lterm_rate = -1 if ($ver eq "1"); + +keys( %count ) = $#dump; +keys( %rate ) = $#dump; +keys( %first_tstamp ) = $#dump; +keys( %last_tstamp ) = $#dump; +keys( %last_printed ) = $#dump; +keys( %is_active ) = $#dump; +keys( %is_abusive ) = $#dump; + +$dump_age = $dump_written - $dump_created; +if ( $total_count > 2 ) { + $est_lterm_rate = $dump_age / ($total_count - 1); +} else { + $est_lterm_rate = 99999; +} + +foreach( @dump ) { + my ($key, $r_count, $r_rate, $r_first_tstamp, $r_last_tstamp) = split; + $count{$key} = $r_count; + $rate{$key} = $r_rate; + $first_tstamp{$key} = $dump_written - $r_first_tstamp; + $last_tstamp{$key} = $dump_written - $r_last_tstamp; + $last_printed{$key} = 0; +} + +while (($ip, $value) = each %last_tstamp) { + + $is_active{$ip} = calc_is_active($ip); + if ( $is_active{$ip} ) { + $num_active++; + $active_count += $count{$ip}; + } + + # If we have a version 1 dump file, try estimating the current bandwidth + # this estimate tends to lag the info from ver 2 dumps, and can be + # off by quite a bit, especially when there isn't much data yet. + if ($rate{$ip} > 0 && $count{$ip} > 5 ) { + my $period = 15*60; + + my $t1 = $first_tstamp{$ip}; + if ( $dump_written - $first_tstamp{$ip} > $period ) { + $t1 = $dump_written - $period; + } + + my $t0 = $last_tstamp{$ip}; + if ( $dump_written - $last_tstamp{$ip} > $period ) { + $t0 = $dump_written - $period; + } + + $cur_freq += (($t0 - $t1)/$period) / $rate{$ip}; + + } else { + $not_counted++; + } + + $num_clients++; + + if ( $count{$ip} < 100 + || ($last_tstamp{$ip} - $first_tstamp{$ip}) / ($count{$ip} - 1) > 30 + || !$is_active{$ip} + ) { + $is_abusive{$ip} = 0; + } else { + $num_abusive++; + $is_abusive{$ip} = 1; + } +} + + +# assume clients that couldn't be counted (new?) are like the rest +#$cur_freq += $not_counted * $lterm_freq / $num_clients; +# fudge for clients that couldn't be counted (ntpdate? new?) +$cur_freq += $not_counted / 2048; +$est_cur_rate = 1/$cur_freq; +$est_cur_rate = $est_lterm_rate if ( $dump_age < 6*1024 + 60 ); + + +# print the report + +printf( "Estimated active ntp pool clients: %6d\n", $num_active); +printf( "Estimated abusive ntp pool clients: %6d\n", $num_abusive); +printf( "Estimated inactive ntp pool clients: %6d\n", $num_clients - $num_active); +printf( "Total ntp pool clients being tracked: %6d\n", $num_clients); +printf( "Note: NTP is a stateless and connectionless (UDP based) protocol, so\n" ); +printf( " exact numbers can't be determined.\n\n" ); + +if ( $dump_age <= 2048 ) { + # need at least two polls from the 1024 before we have good data + printf( "** Warning: this dump file is very new and many results will be misleading. **\n" ); +} elsif ( $dump_age <= 4096 ) { + # this is barely long enough to detect most abusive clients (30s * 100req) + # and most of the graphs are still pretty useless. + printf( "Note: this dump file is too new to give very accurate results.\n" ); +} + +printf( "%9d ntp requests, in total, have been seen since %s\n", + $total_count, strftime( "%D %T", localtime( $dump_created ) ) ); +printf( "%9d (%4.1f%%) are from clients that are still active.\n", + $active_count, 100 * ($active_count / $total_count) ) if ($total_count > 0 ); + +sub print_rates { + my ($hdr, $rate) = @_; + printf( "%s request rate: %.3f seconds between requests (%.2f req/sec)\n", + $hdr, $rate, 1/$rate ); + printf( "%s bandwidth in: %6.3f KBytes/s %7.3f Kbits/s\n", + $hdr, 76./(1024*$rate), 76*8./(1024*$rate) ); + printf( "%s bandwidth in: %6.3f GB/month %7.3f Gb/month\n", + $hdr, 30*24*60*60*76./(1024*1024*1024*$rate), 30*24*60*60*76*8./(1024*1024*1024*$rate) ); + } + +if ( $dump_age > 12*60*60 ) { + if ( $lterm_rate > 0 ) { + print_rates( "Long term", $lterm_rate ); + } elsif ( $est_lterm_rate > 0 ) { + print_rates( "Est Long term", $est_lterm_rate ); + } +} + +if ( $cur_rate > 0 ) { + print_rates( "Current", $cur_rate ); +} elsif ( $est_cur_rate > 0 ) { + print_rates( "Est current", $est_cur_rate ); +} + +printf( "(NTP packets are usually 76 bytes, UDP overhead included, in each direction.)\n" ); + +print "\n"; +printf( "The dump file was written %d seconds ago, at %s\n", + $tdiff, strftime( "%D %T", localtime( $dump_written ) ) ); + +exit if ($total_count < 1 ); + + +print "\n"; +print "Subnets with many clients:\n"; +my (%class_c, %class_c_rate, %class_c_count, %class_c_abusive); +keys( %class_c ) = $num_clients; +foreach $ip (keys( %last_tstamp )) { + my $key = $ip; + next if ( ! $is_active{$ip} ); + $key =~ s/^([0-9.]*)\.[0-9][0-9]*$/$1/; + $class_c{$key}++; + $class_c_rate{$key} += 1./$rate{$ip} if ($rate{$ip} > 0); + $class_c_count{$key} += $count{$ip} if ($count{$ip} > 0); + $class_c_abusive{$key} += $is_abusive{$ip}; +} +printf "# of Subnet Total Aggregate Abusive\n"; +printf " IPs Count Rate Clients\n"; + +foreach $key (sort { $class_c{$b} <=> $class_c{$a} } keys %class_c) { + last if ( $class_c{$key} < 4 ); + printf "%4d %-13s", $class_c{$key}, $key . ".x"; + if ( defined( $class_c_count{$key} ) ) { + printf " %8d", $class_c_count{$key}; + } + if ( defined( $class_c_rate{$key} ) ) { + printf " %9.3f", 1./$class_c_rate{$key}; + } + if ( defined( $class_c_abusive{$key} ) ) { + printf " %7d", $class_c_abusive{$key}; + } + printf "\n"; +} + + +print "\n"; +print "Clients with rapid updates (min requests of 100):\n"; +my @bad_clients; +while (($ip, $value) = each %last_tstamp) { + my $is_rapid = $count{$ip} >= 100 && $rate{$ip} <= 60; + +# printf( "%-15s count: %6d delta: %10.3f rate: %7.2f active: %d\n", +# $ip, $count{$ip}, ($last_tstamp{$ip} - $first_tstamp{$ip}), +# $rate{$ip}, $is_active{$ip} ) if ($is_abusive{$ip} && !$is_rapid ); + + next if (!$is_rapid && !$is_abusive{$ip} ); + + push @bad_clients, $ip +} +print "Rank First Seen Client IP Requests Rate Usage Cumulative\n"; +$sum = 0; +$rank = 0; +foreach $ip (sort { $count{$b} <=> $count{$a} } @bad_clients) { + $rank++; + if ( $is_active{$ip} ) { + $sum += $count{$ip}; + + printf( "%3d %s %-15s %7d %8.2f %5.2f%% %5.2f%% *", + $rank, strftime( "%D %T", localtime( $first_tstamp{$ip} ) ), + $ip, $count{$ip}, $rate{$ip}, + 100 * $count{$ip} / $active_count, 100 * $sum / $active_count + ); + } else { + printf( "%3d %s %-15s %7d %8.2f (%5.2f%%)", + $rank, strftime( "%D %T", localtime( $first_tstamp{$ip} ) ), + $ip, $count{$ip}, $rate{$ip}, + 100 * $count{$ip} / ($active_count + $count{$ip} ) + ); + } + print " !" if ($is_abusive{$ip}); + print "\n"; + last if ( $rank >= 100 ); +} +print "* = \"active\" = probably will send another request.\n"; +print "! = \"abusive\" = min requests of 100 and an average rate of less than 30s\n"; +print " between requests over the life of the entire connection.\n"; +print "Percentages are based on all packets from *active* clients. If the client\n"; +print "is not active, its percentage is what it would be if it were active.\n"; + + +print "\n\n"; +print "Clients with very long updates (min requests of 10):\n"; +my @great_clients; +while (($ip, $value) = each %last_tstamp) { + next if ( $count{$ip} < 10 || $rate{$ip} < 3000 ); + + push @great_clients, $ip +} +print "Rank First Seen Client IP Requests Rate Usage Cumulative\n"; +$sum = 0; +$rank = 0; +foreach $ip (sort { $rate{$b} <=> $rate{$a} } @great_clients) { + $rank++; + + if ( $is_active{$ip} ) { + $sum += $count{$ip}; + + printf( "%3d %s %-15s %7d %8.2f %5.2f%% %5.2f%% *", + $rank, strftime( "%D %T", localtime( $first_tstamp{$ip} ) ), + $ip, $count{$ip}, $rate{$ip}, + 100 * $count{$ip} / $active_count, 100 * $sum / $active_count + ); + } else { + printf( "%3d %s %-15s %7d %8.2f (%5.2f%%)", + $rank, strftime( "%D %T", localtime( $first_tstamp{$ip} ) ), + $ip, $count{$ip}, $rate{$ip}, + 100 * $count{$ip} / ($active_count + $count{$ip} ) + ); + } + print "\n"; + last if ( $rank >= 10 ); +} + +my @client_dist; +my ($log_idx, $idx); +my $base = log( sqrt(2) ); + + +@client_dist = (); +print "\n"; +print "Clients distribution by count:\n"; +while (($ip, $value) = each %last_tstamp) { + + if ( $count{$ip} <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $count{$ip} )/$base; + } + $idx = int( $log_idx + 0.5 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + if ( int( $low ) == int( $high ) ) { + printf( "Error: value in zero range bin!: %d %7.1f - %7.1f\n", $idx, $low, $high ) if ( $client_dist[$idx] != 0 ); + next; + } + $low++; + if ( int( $low ) == int( $high ) ) { + $center = $high; + } + printf( "%7d (%7d - %7d) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} + + +@client_dist = (); +print "\n"; +print "Clients distribution by rate (min requests of 10): \n"; +while (($ip, $value) = each %last_tstamp) { + next if ( $count{$ip} < 10 ); + + if ( $rate{$ip} <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $rate{$ip} )/$base; + } + $idx = int( $log_idx + 0.5 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + printf( "%7.1f (%7.1f - %7.1f) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} + + +@client_dist = (); +print "\n"; +print "Clients distribution by amount of time client was active (in hours):\n"; +while (($ip, $value) = each %last_tstamp) { + + my $age = ($last_tstamp{$ip} - $first_tstamp{$ip})/(60*60); + + if ( $age <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $age )/$base; + } + $idx = int( $log_idx + 0.5 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + printf( "%7.1f (%7.1f - %7.1f) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} + + +@client_dist = (); +print "\n"; +print "Clients distribution by time since first request (in hours):\n"; +while (($ip, $value) = each %last_tstamp) { + + my $age = ($dump_written - $first_tstamp{$ip})/(60*60); + +# printf( "%-15s %s %s %.2f\n", +# $ip, +# strftime( "%D %T", localtime( $first_tstamp{$ip} ) ), +# strftime( "%D %T", localtime( $last_tstamp{$ip} ) ), +# $age ); + + if ( $age <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $age )/$base; + } + $idx = int( $log_idx + 0.5 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + printf( "%7.1f (%7.1f - %7.1f) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} + + +@client_dist = (); +print "\n"; +print "Active clients distribution by time since last request (in secs):\n"; +while (($ip, $value) = each %last_tstamp) { + + next if ( ! $is_active{$ip} ); + + my $age = ($dump_written - $last_tstamp{$ip}); + + if ( $age <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $age )/$base; + } + $idx = int( $log_idx + 0.5 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + printf( "%7.1f (%7.1f - %7.1f) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} + + +@client_dist = (); +print "\n"; +print "Inactive clients distribution by time since last request (in secs):\n"; +while (($ip, $value) = each %last_tstamp) { + + next if ( $is_active{$ip} ); + + my $age = ($dump_written - $last_tstamp{$ip}); + + if ( $age <= 1 ) { + $log_idx = 0; + } else { + $log_idx = log( $age )/$base; + } + $idx = int( $log_idx + 0.5 ); + printf( "%-15s inactive %5d %7.2f > %7.2f\n", $ip, $count{$ip}, ($dump_written - $last_tstamp{$ip}) / 10.,$rate{$ip} ) if ( $idx < 10 ); + $client_dist[$idx]++; +} +$max_dist = 0; +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $max_dist = $client_dist[$idx] if ( defined $client_dist[$idx] && $max_dist < $client_dist[$idx] ); +} +print " bin ( bin range ) count\n"; + +for( $idx = 0; $idx <= $#client_dist; $idx++ ) { + $client_dist[$idx] = 0 if ( ! defined( $client_dist[$idx] ) ); + my $low = $idx ? 2**($idx/2 - .25) : 0; + my $high = 2**($idx/2 + .25); + my $center = 2**($idx/2); + my $num_star = 45 * ($client_dist[$idx] / $max_dist); + printf( "%7.1f (%7.1f - %7.1f) %5d %.*s\n", + $center, $low, $high, $client_dist[$idx], $num_star, $stars ); +} diff --git a/scripts/ntpclientsd b/scripts/ntpclientsd new file mode 100755 index 0000000..17dd991 --- /dev/null +++ b/scripts/ntpclientsd @@ -0,0 +1,477 @@ +#!/usr/bin/perl -w + +# This script is public domain, there is no copyright on it. +# - Wayne Schlitt + + +use strict; + +use Getopt::Long; +use POSIX qw(strftime); +use Fcntl ':flock'; + +#my $TCPDUMP_PROG = "/tmp/tcpdump-3.7.2/tcpdump"; +#my $TCPDUMP_PROG = "/tmp/tcpdump-3.9.1/tcpdump"; +my $TCPDUMP_PROG = "tcpdump"; + + +my $HELP = 0; +my $DUMPFILE = ""; +my $STARTFILE = ""; +my $VERBOSE = 1; +my $INTERFACE = ""; +my $TCPDUMP_VER = undef; + +my $result = GetOptions('help' => \$HELP, + 'dumpfile=s' => \$DUMPFILE, + 'startfile=s' => \$STARTFILE, + 'interface=s' => \$INTERFACE, + 'verbose+' => \$VERBOSE, + 'tcpdump=f' => \$TCPDUMP_VER, + 'quiet' => sub { $VERBOSE = 0 }, + ); + +if ($HELP || !$result) { + print "Usage: ntp_clients [options]\n"; + print "\n"; + print " -help Help on the options.\n"; + print "\n"; + print " -dumpfile=/path/dumpfile File to read/write internal state\n"; + print " -startfile=/path/dumpfile File to read initial state\n"; + print " -interface=eth# Lan interface to monitor\n"; + print " -verbose increase amount of information printed\n"; + print " by default, track only bad clients\n"; + print " -v will give you all clients\n"; + print " -v -v will give you all requests\n"; + print " -tcpdump=version set format of NTP trace output"; + print " -quiet don't print any info\n"; + + exit(0); +} + +$| = 1; + +# pre-allocate the hashes, just to make things a little faster + +my (%count, %rate, %first_tstamp, %last_tstamp, %last_printed); +keys( %count ) = 1024; +keys( %rate ) = 1024; +keys( %first_tstamp ) = 1024; +keys( %last_tstamp ) = 1024; +keys( %last_printed ) = 1024; + +my $dump_magic = "ntp_stats"; +my $dump_version = 2; +my $dump_created = time(); +my $dump_written = 0; +my $dump_age = 0; + +my $total_count = 0; +my $base_rate = 99999; +my $cur_period = 15*60; +my $cur_rate = -1; +my $cur_factor1 = 1; +my $cur_factor2 = 0; +my $lterm_period = 15*24*60*60; +my $lterm_rate = -1; +my $lterm_factor1 = 1; +my $lterm_factor2 = 0; +my $prev_tstamp = undef; +my $delta = undef; +my $num_clients = 0; +my $last_cleaned = 0; +my $last_mday = (localtime(time))[3]; + +my ($tstamp, $ip_str, $ip, $tofrom, $ip2, $ver, $s_or_c, $s_or_c2, $s_or_c3 ); + +$tstamp = $prev_tstamp = time(); + + +$STARTFILE = $DUMPFILE if ( !$STARTFILE && $DUMPFILE ); + +if ( $STARTFILE && open(DUMP, "<", $STARTFILE ) ) { + flock(DUMP,LOCK_SH); + + my ($magic, $ver); + ($magic, $ver, $total_count, $dump_created, $dump_written, $cur_rate, $lterm_rate) = split(' ', ); + die "$STARTFILE was not created by ntp_stats" if ( !defined( $magic ) || $magic ne $dump_magic ); + die "Incorrect dump file version: $ver" if ( !defined( $ver ) || ($ver ne "1" && $ver ne "2") ); + if (time() - $dump_written > 2*60*60 ) { + # If the dumpfile is too old, it will really screw up the stats + # and it will take hours for them to drift back to what they should be. + # Might as well just start over. + print "Warning: start file too old to be used.\n"; + $total_count = 0; + $base_rate = 99999; + $cur_rate = -1; + $lterm_rate = -1; + $dump_created = time(); + $dump_written = 0; + $dump_age = 0; + } else { + $cur_rate = -1 if ($ver eq "1"); + $lterm_rate = -1 if ($ver eq "1"); + $dump_age = $dump_written - $dump_created; + if ( $total_count > 2 ) { + $base_rate = $dump_age / ($total_count - 1); + } else { + $base_rate = 99999; + } + $lterm_rate = $base_rate if ( $lterm_rate > $base_rate * 10 ); + $cur_rate = $lterm_rate if ( $cur_rate > $lterm_rate * 10 ); + + while() { + chomp; + my ($key, $r_count, $r_rate, $r_first_tstamp, $r_last_tstamp, $r_last_printed) = split; + $count{$key} = $r_count; + $rate{$key} = $r_rate; + $first_tstamp{$key} = $dump_written - $r_first_tstamp; + $last_tstamp{$key} = $dump_written - $r_last_tstamp; + $last_printed{$key} = 0; + $num_clients++; + } + } + flock(DUMP,LOCK_UN); + close(DUMP); +} + + +my $tcpdump_major = -1; +my $tcpdump_minor = -1; +my $tcpdump_patch = -1; +if ( ! defined( $TCPDUMP_VER ) ) { + open(PROG, "$TCPDUMP_PROG -V 2>&1 |") or die "Can't run tcpdump: $!"; + $TCPDUMP_VER = ; + close(PROG); + die "Could not determine tcpdump version" if ( !defined( $TCPDUMP_VER ) ); + $TCPDUMP_VER =~ s/^tcpdump version ([.0-9]*).*\n$/$1/; + + # Apparently, the 3.8.2 release is screwed up. Straight from + # tcpdump.org, the tarball for v3.8.2 has stuff that is marked as being + # in the "v3.9 branch", while the v3.8.3 release has the older stuff, + # from the "v3.8 branch". To make matters worse, it appears that at + # least the RedHat Fedora-Core 3 release of tcpdump v3.8.2 claims to + # be just version "3.8". + + if ( $TCPDUMP_VER eq "3.8.2" || $TCPDUMP_VER eq "3.8" ) { + $TCPDUMP_VER = "3.9"; + } +} + + +($tcpdump_major = $TCPDUMP_VER) =~ s/^([0-9][0-9]*).*/$1/; +($tcpdump_minor = $TCPDUMP_VER) =~ s/^[0-9][0-9]*\.([0-9][0-9]*).*/$1/; +($tcpdump_patch = $TCPDUMP_VER) =~ s/^[0-9][0-9]*\.[0-9][0-9]*\.([0-9][0-9]*.*)/$1/; +$tcpdump_minor = 0 if ($tcpdump_minor eq $TCPDUMP_VER); +$tcpdump_patch = 0 if ($tcpdump_patch eq $TCPDUMP_VER); +print "Warning: untested tcpdump version: $TCPDUMP_VER\n" + if ( $tcpdump_major != 3 || $tcpdump_minor < 4 || $tcpdump_minor > 9 ); + + +print "tcpdump version: $TCPDUMP_VER $tcpdump_major $tcpdump_minor $tcpdump_patch\n" if ($VERBOSE > 2); + +# get the data +my $iface = ""; +$iface = "-i $INTERFACE" if ( $INTERFACE ne "" ); +open(PROG, "$TCPDUMP_PROG -n -tt -p $iface port 123 2>/dev/null |") or die "Can't run tcpdump: $!"; + +# process the data + +if ( $VERBOSE == 1 && (time() - $dump_created < 10*60 || $total_count <= 25 * $num_clients) ) { + print "Collecting data... May take up to 100 minutes to display bad clients.\n\n"; +} + +if ( $VERBOSE ) { + printf " Time Total Num Client Client Delta Rate\n"; + printf " Requests Clients IP Requests (sec) (sec)\n"; +} + +while() { + + if ( $tcpdump_major > 3 || ($tcpdump_major == 3 && $tcpdump_minor >= 9 ) ) { + ($tstamp, $ip_str, $ip, $tofrom, $ip2, $ver, $s_or_c, $s_or_c2, $s_or_c3 ) = split; +# print("$_"); +# printf( "tstamp: %s\n", $tstamp ); +# printf( "ip_literal: %s\n", $ip_str ); +# printf( "ip: %s\n", $ip ); +# printf( "tofrom: %s\n", $tofrom ); +# printf( "ip2: %s\n", $ip2 ); +# printf( "ver: %s\n", $ver ); +# printf( "s_or_c: %s\n", $s_or_c ); + + # I'm getting a few packets with incorrect NTP versions + if ( $ver !~ "NTPv" ) { + print("wrong protocol: $_" ) if ( $VERBOSE > 1 ); + next; + } + + if ( $ver !~ "NTPv[1-4]" ) { + print("unsupported NTP version: $_" ) if ( $VERBOSE > 1 ); + next; + } + + # sanity check the rest of the data + + if ($s_or_c eq "+1s" || $s_or_c eq "-1s" ) { + $s_or_c = $s_or_c2; + $s_or_c2 = $s_or_c3; + $s_or_c3 = undef; + } + + if ( !defined($s_or_c) ) { + print( "The server/client value is missing from the tcpdump\n" ); + print( "output: $_\n" ); + next; + } + + $s_or_c =~ s/,$//; + $s_or_c = lc($s_or_c); + if ( $s_or_c eq "symmetric" && defined( $s_or_c2 ) ) { + $s_or_c2 = lc($s_or_c2); + $s_or_c = "sym_act" if ( $s_or_c2 eq "active," ); + $s_or_c = "sym_pas" if ( $s_or_c2 eq "passive," ); + $s_or_c2 = $s_or_c3; + $s_or_c3 = undef; + } + + $s_or_c = "res1" if ( $s_or_c eq "reserved" ); + $s_or_c = "unspec" if ( $s_or_c eq "unspecified" ); + $s_or_c = "bcast" if ( $s_or_c eq "broadcast" ); + + + if ( defined($s_or_c2) && $s_or_c2 eq "length" + && defined($s_or_c3) && $s_or_c3 ne "48" ) { + print("Warning: wrong length NTP packet: $_" ) if ( $VERBOSE > 2 ); + } + + } elsif ( $tcpdump_major == 3 && $tcpdump_minor == 8 ) { + ($tstamp, $ip_str, $ip, $tofrom, $ip2, $ver, $s_or_c, $s_or_c2 ) = split; +# print("$_"); +# printf( "tstamp: %s\n", $tstamp ); +# printf( "ip_literal: %s\n", $ip_str ); +# printf( "ip: %s\n", $ip ); +# printf( "tofrom: %s\n", $tofrom ); +# printf( "ip2: %s\n", $ip2 ); +# printf( "ver: %s\n", $ver ); +# printf( "s_or_c: %s\n", $s_or_c ); + + # I'm getting a few packets with incorrect NTP versions + if ( $ver !~ "NTPv" ) { + print("wrong protocol: $_" ) if ( $VERBOSE > 1 ); + next; + } + + if ( $ver !~ "NTPv[1-4]" ) { + print("unsupported NTP version: $_" ) if ( $VERBOSE > 1 ); + next; + } + + if ( $ver =~ "^\\[len=[0-9][0-9]*]NTPv[1-4]\$" ) { + print("Warning: wrong length NTP packet: $_" ) if ( $VERBOSE > 2 ); + } + + # sanity check the rest of the data + + if ($s_or_c eq "+1s" || $s_or_c eq "-1s" ) { + $s_or_c = $s_or_c2; + $s_or_c2 = undef; + } + + if ( !defined($s_or_c) ) { + print( "The server/client value is missing from the tcpdump\n" ); + print( "output: $_\n" ); + next; + } + + $s_or_c =~ s/,$//; + } else { + ($tstamp, $ip, $tofrom, $ip2, $ver, $s_or_c, $s_or_c2, $s_or_c3 ) = split; +# print("$_"); +# printf( "tstamp: %s\n", $tstamp ); +# printf( "ip: %s\n", $ip ); +# printf( "tofrom: %s\n", $tofrom ); +# printf( "ip2: %s\n", $ip2 ); +# printf( "ver: %s\n", $ver ); +# printf( "s_or_c: %s\n", $s_or_c ); + + if ( $ver =~ "^\\[len=[0-9][0-9]*]" && $s_or_c =~ "^v[1-4]\$" ) { + print("Warning: wrong length NTP packet: $_" ) if ( $VERBOSE > 2 ); + $ver = $s_or_c; + $s_or_c = $s_or_c2; + $s_or_c2 = $s_or_c3; + $s_or_c3 = undef; + } + + # I'm getting a few packets with incorrect NTP versions + if ( $ver !~ "v" ) { + print("wrong protocol: $_" ) if ( $VERBOSE > 1 ); + next; + } + + if ( $ver !~ "v[1-4]" ) { + print("unsupported NTP version: $_" ) if ( $VERBOSE > 1 ); + next; + } + + # sanity check the rest of the data + + if ($s_or_c eq "+1s" || $s_or_c eq "-1s" ) { + $s_or_c = $s_or_c2; + $s_or_c2 = $s_or_c3; + $s_or_c3 = undef; + } + + if ( !defined($s_or_c) ) { + print( "The server/client value is missing from the tcpdump\n" ); + print( "output: $_\n" ); + next; + } + } + + if ( $VERBOSE > 1 + && $s_or_c ne "server" && $s_or_c ne "client" + && $s_or_c ne "sym_pas" && $s_or_c ne "sym_act" + && $s_or_c ne "res1" && $s_or_c ne "res2" + && $s_or_c ne "unspec" && $s_or_c ne "bcast" + ) { + printf( "Invalid server/client value: %s\n", $s_or_c ); + printf( "tcpdump line: %s\n", $_ ); + } + + # select only packets being sent, not coming to + next if ( $s_or_c ne "client" && $s_or_c ne "sym_act" ); + + + + # clean up the variables, removing port numbers, punctuation, etc. + $ip =~ s/\.[0-9a-z_-]+$//; +# $ip2 =~ s/\.[0-9a-z_-]+:$//; + + + # calculate stats + if ( defined($first_tstamp{$ip}) ) { + $count{$ip}++; + $delta = $tstamp - $last_tstamp{$ip}; + if ( $count{$ip} < 2 ) { + $rate{$ip} = -1; # workaround old bug, stored in the dump + } elsif ( $count{$ip} < 10 ) { + $rate{$ip} = ($tstamp - $first_tstamp{$ip}) / ($count{$ip} - 1); + } else { + # Calculate weighted average. Recent deltas count for more. + # The most recent counts for 5% of the average, the 10th oldest + # counts for 2.9%, the 50th counts for 0.38%, the 100th for 0.03% + # This tries to allow for quick detection of clients changing their + # polling rate while ignoring the effects dropped packets. + $rate{$ip} = $rate{$ip} * 0.95 + $delta * 0.05; + } + $last_tstamp{$ip} = $tstamp; + + } else { + $num_clients++; + $count{$ip} = 1; + $delta = -1; + $rate{$ip} = -1; + $first_tstamp{$ip} = $last_tstamp{$ip} = $tstamp; + $last_printed{$ip} = 0; + } + + $total_count++; + $lterm_rate = $lterm_rate * $lterm_factor1 + + ($tstamp - $prev_tstamp) * $lterm_factor2; + $cur_rate = $cur_rate * $cur_factor1 + + ($tstamp - $prev_tstamp) * $cur_factor2; + $prev_tstamp = $tstamp; + + # remove old entries, once a minute + if ( $tstamp - $last_cleaned > 60 ) { +# print "cleaning...\n"; + $last_cleaned = $tstamp; + $num_clients = 0; + + # recalculate rate factors. They don't change much, but they + # do gradually drift as more clients join the pool. + if ( $total_count > 2 ) { + $base_rate = ($tstamp - $dump_created) / ($total_count - 1); + if ( $base_rate <= $lterm_period ) { + # the most recent 15days gives about 60% of the value of lterm_rate + $lterm_factor2 = $base_rate / $lterm_period; + $lterm_factor1 = 1 - $lterm_factor2; + } + if ( $lterm_rate <= $cur_period ) { + # the most recent 15min gives about 60% of the value of cur_rate + $cur_factor2 = $lterm_rate / $cur_period; + $cur_factor1 = 1 - $cur_factor2; + } + } + $dump_age = $tstamp - $dump_created; + if ( $dump_age < $lterm_period || $lterm_rate > $dump_age + || $lterm_rate <= $base_rate * .1 || $lterm_rate > $base_rate * 10 ) { + $lterm_rate = $base_rate; + } + if ( $dump_age < $cur_period || $lterm_rate > $dump_age + || $cur_rate <= $lterm_rate * .1 || $cur_rate > $lterm_rate * 10 ) { + $cur_rate = $lterm_rate; + } + + + if ( $DUMPFILE ) { + open(DUMP, ">", $DUMPFILE ); + flock(DUMP,LOCK_EX); + printf( DUMP "%s %d %d %d %.3f %g %g\n", + $dump_magic, $dump_version, $total_count, + $dump_created, $tstamp, $cur_rate, $lterm_rate ); + } + + while (my ($key, $value) = each %last_tstamp) { + + if ( $tstamp - $value > 2**14 + 60 ) { # I've seen clients with 14 poll + delete $count{$key}; + delete $rate{$key}; + delete $first_tstamp{$key}; + delete $last_tstamp{$key}; + delete $last_printed{$key}; + } else { + $num_clients++; + printf( DUMP "%-15s %8d %9.3f %12.3f %9.3f\n", + $key, $count{$key}, $rate{$key}, + $tstamp - $first_tstamp{$key}, $tstamp - $last_tstamp{$key} ) + if ( $DUMPFILE ); + } + } + if ( $DUMPFILE ) { + flock(DUMP,LOCK_UN); + close(DUMP); + } + } + + next if ( $VERBOSE == 0 ); + + # keep bad clients from filling the log + if ( $VERBOSE == 1 ) { + my $lprt = $tstamp - $last_printed{$ip}; + next if ( $lprt < 60*60 ); + next if ( $count{$ip} < 100 || $rate{$ip} > 60 ); + } + elsif ( $VERBOSE == 2 ) { + my $age = $tstamp - $first_tstamp{$ip}; + my $lprt = $tstamp - $last_printed{$ip}; + next if ( ($age > $lprt * 3 || $lprt < 2) && $lprt < 500 ); + } + + # print out the results + my @ltime = localtime( int($tstamp) ); + if ( $last_mday != $ltime[3] ) { + printf( " --- Mark: %s ---\n", strftime("%D %T",@ltime) ); + $last_mday = $ltime[3]; + } + my $tstr = strftime("%T", @ltime); + $last_printed{$ip} = $tstamp; + if ( $count{$ip} == 1 ) { + printf( "%s %9d %6d %-15s %8d\n", + $tstr, $total_count, $num_clients, $ip, $count{$ip} ); + } else { + printf( "%s %9d %6d %-15s %8d %+8.1f %8.2f\n", + $tstr, $total_count, $num_clients, $ip, $count{$ip}, + $delta, $rate{$ip} ); + } +} -- cgit v1.2.3-59-ga6da