#!/usr/bin/perl -Tw # $Id: poison 116 2003-04-01 12:21:57Z aqua $ # sugarplum poison CGI # Copyright (c) 1999 by Devin Carraway # Freely distributable under terms of the GPL. # The poison CGI does the actual dealings with spam spiders. # It has the job of giving them something realistic to # peruse, including valid-looking addresses with which to # poison the spammer's database. # For the obvious reason, this script should either be named # innocuously, or else ScriptAliased in such a way as to hide # its name. Pick something suitably random. # See http://www.devin.com/sugarplum/ for the rest. # The user should adjust the configuration options below to # suit their particular whims. Also it may be desirable to # alter the subroutine fake_address() if a particular spam # collection system is in place on the site already. use strict; use Getopt::Long; use Sys::Hostname; use IO::File; my $VERSION = 0.9.10; # American TLDs for use generating random addresses. Non-US TLDs omitted, # on the presumption that it will reduce international bandwidth costs. my @TLDs = ( 'com', 'org', 'net', 'edu', 'jp', 'co.cn' ); my @teergrube_address_fqdn = ( 'spamcatcher.teergrube.net', 'spamyourself.teergrube.net' ); my %opts = ( # log level (0 for no logging besides the webserver's own) loglevel => 0, # log file (if loglevel set true) logfile => '/var/log/sugarplum.log', # Should sugarplum work deterministically? If set true, a given # URL will always seed perl's RNG with a consistent value (sum of # the ordinal values of the bytes in the PATH_INFO variable) -- # consequently the same random values will be picked, so multiple # reloads of any given URL in sugarplum will produce identical # output. Hence if a harvester were to check for poison by # comparing data on subsequent loads, they won't be tipped off. deterministic => 1, # If set, the system hostname will be included in the seed in # deterministic mode. This introduces variance into different # sugarplum installations with identical paths (e.g. /sugarplum/), # thus preventing an attacker from doing hash-based poison # avoidance. This is desirable in all cases other than when # using sugarplum on sites run on clustered servers, where the # hostname may not be consistent across multiple loads of the # same URI. deterministic_by_hostname => 1, # If set, the HTTP server name will be rolled into the seed # in deterministic mode. This yields differing sugarplum # output across different virtualhosts, or indeed across # different servers. A possible drawback is that it may be # possible for an attacker to detect the poison programmatically # on sites using wildcard-matched vhosts. deterministic_by_httphost => 1, # path to the dictionary dictfile => '/usr/share/dict/words', # if true, entries from the above spammer email list will # be included as addresses. See poison_spammer_frequency. use_spammerlist => 0, # path to the known-spammer list, one email address per line. # not needed if use_spammerlist is set false. spammerfile => '/etc/sugarplum/spammers', # if true, randomly generated email addresses will be # included in the output. See poison_address_frequency. use_poison_addresses => 1, # if true, generated addresses will be used based on those # inside your teergrube -- see address_teergrube_frequency. use_teergrube_addresses => 1, # how many paragraphs of poisoned output? poison_paragraphs => 8, # should the background be randomized? (arguably improves # page plausibility, but often looks wonky to humans) poison_random_background => 1, # How deep should the URLs recurse (remember, this is # an exponential factor) poison_maxdepth => 9, # Minimum number of words per paragraph? poison_paragraph_wordcount_base => 50, # Random range of words beyond the base? poison_paragraph_wordcount_range => 75, # Column before which lines in output should have newlines # appended; simulates editor linewrap. Set very high to # disable linewrap. poison_paragraph_linewrap_col => 78, # What percentage of email addresses should be derived from a # dictionary word? (range 0 to 1 inclusive) poison_word_username_frequency => 0.6, # and of those, what percentage should have numbers attached to the # end (as with AOL and similar providers)? (0-1 inclusive) poison_wordnumber_username_frequency => 0.3, # and if so, up to how many digits? poison_wordnumber_username_maxdigits => 4, # The below should add up to 1 in any combination. # poison_spammer_frequency has been renamed -- # poison_address_frequency now specifies the frequency of all # addresses, while address_spammer_frequency (below) denotes the # portion of addresses taken from known spammers. # Percentage of words which should be poisoned addresses (see # below) poison_address_frequency => 0.03, # Percentage of words which should be normal words poison_dictword_frequency => 0.97, # Settings regulating what proportions of address types will be # emitted -- should add up to 1. Any zero value disables the # feature. # percentage of addresses emitted which should be taken from # the list of known spammers (0 to disable) address_spammer_frequency => 0.2, # percentage of addresses emitted which should be generated in # the teergrube (randomuser@teergrube.domain.tld): address_teergrube_frequency => 0.4, # percentage of addresses emitted which should simply be totally # random (0 to disable): address_random_frequency => 0.6, # give a fully-qualified hostname, which will be # used in generating tarpit addresses. teergrube_address_fqdn => 'spamcatcher.teergrube.net', ); sub read_config { my $fn = shift || return undef; my $f = new IO::File($fn) || die "$fn: $!"; my $l = 0; while (<$f>) { $l++; next if /^(\s*#|\s*$)/; chomp; if (/^(\w+):\s+(\S.*)/) { $opts{$1} = $2; } else { die "malformed config '$_' in $fn:$l\n"; } } 1; } sub usage { print "usage: $0\n", "\t-h : help\n", "\t-c | --config : specify config file\n", "output options:\n", map { tr/_/-/; "\t--$_ \n" } sort keys %opts; exit 0; } $ENV{PATH} = '/bin:/usr/bin'; $ENV{IFS} = ' '; delete $ENV{ENV}; delete $ENV{CDPATH}; ## work out a runtime configuration my $config_fn; GetOptions('config=s' => \$config_fn, 'c=s' => \$config_fn, 'h' => \&usage, (map { my $k = $_; $k =~ tr/_/-/; ("$k=s" => \$opts{$_}) } keys %opts)) || &usage; unless ($config_fn) { for ('/etc/sugarplum/config', '/usr/local/etc/sugarplum/config') { if (-e $_) { $config_fn = $_; last } } } $config_fn and &read_config($config_fn); my ($dict,$dict_size,$spammers,$spammers_size); my $uri = $ENV{REQUEST_URI} || 'http://localhost.test/sugarplum/'; $uri .= '/' unless $uri =~ /\/$/; my $depth = !$ENV{PATH_INFO} || ($ENV{PATH_INFO} =~ tr:/:/:); my $okrecurse = ($depth < $opts{poison_maxdepth}); $dict = new IO::File($opts{dictfile}) || die "dictfile $opts{dictfile}: $!"; $dict_size = (stat($dict))[7] || die "dictfile $opts{dictfile} is empty"; if ($opts{use_spammerlist}) { $spammers = new IO::File($opts{spammerfile}) || die "spammerfile $opts{spammerfile}: $!"; $spammers_size = (stat($spammers))[7] || die "spammerfile $opts{spammerfile} is empty"; } print "Content-Type: text/html\n", "Last-Modified: ".scalar localtime(((stat($0))[9]-65536)+ length($ENV{PATH_INFO} || $0)*1280), "\n\n"; if ($opts{deterministic}) { my $seed = 0; if ($opts{deterministic_by_hostname}) { $seed += $_ for (unpack('C*',hostname || '')); } if ($opts{deterministic_by_httphost}) { $seed += $_ for (unpack('C*',$ENV{HTTP_HOST} || $ENV{SERVER_NAME} || '')); } $seed += $_ for (unpack('C*',$ENV{PATH_INFO} || $0)); srand($seed); } ## HTML generation thus commences if (!int rand 2) { if (!int rand 3) { my @dt = ('', ''); print $dt[int rand ($#dt+1)],"\n"; } print int rand 2 ? '' : '',"\n"; } if (!int rand 2) { # sometimes have a head of varying elaboration print "\n"; my $indent = int rand 2 ? "\t" : ' ' x int rand 10; print $indent,¶graph(1 + int rand 10,2,'title'),"\n"; !int rand 3 and print $indent,"\n"; # irony? !int rand 4 and print $indent,"\n"; !int rand 4 and print $indent,"\n"; print "\n"; } elsif (int rand(1)<0.9) { # _almost_ always have a title. :) print ¶graph(1 + int rand 10,2,'title'),"\n"; } $opts{poison_random_background} and !int rand 2 and printf "\n", int rand 0x100,int rand 0x100, int rand 0x100, int rand 0x100,int rand 0x100, int rand 0x100; !int rand 2 and print ¶graph(1 + int rand 8,2,'h1'),"\n\n"; !int rand 3 and $okrecurse and do { my $x; print '

[ '; for (0..1+int rand 8) { $x = &random_word; print "$x | "; } $x = &random_word; print "$x ]

"; }; for (1..$opts{poison_paragraphs}) { print ¶graph($opts{poison_paragraph_wordcount_base} + int rand $opts{poison_paragraph_wordcount_range},int rand 2); } if ($opts{loglevel}) { my $log = new IO::File('>>'.$opts{logfile}) || die "$opts{logfile}: $!"; print $log join("\t",scalar localtime, $ENV{REMOTE_HOST} || 'unknown host', $ENV{REMOTE_ADDR} || 'unknown addr', $ENV{HTTP_USER_AGENT} || 'unknown agent', $uri),"\n"; } exit 0; sub paragraph { my $wordcount = shift; my $format = shift; my ($i,$i1,$x,$word,$capnext); my $buf = ''; my $ll = 0; $wordcount ||= 50 + int rand 75; if (!$format) { # simple words, no formatting $buf = '

'; $capnext = 1; for $i (1..$wordcount) { $word = &word; substr($word,0,1) =~ tr/a-z/A-Z/, $capnext=0 if $capnext; $buf .= $word.".",next if $i==$wordcount; $x = rand 1; if ($x<0.35 || $word =~ tr/$word "; } elsif ($x < 0.50) { # end of sentence $word .= (('.') x 20, '.','.','!','?','?')[int rand 25].' '; $capnext = 1; } elsif ($x < 0.56) { $word .= ', '; } elsif ($x < 0.58) { $word .= ': '; } elsif ($x < 0.60) { $word .= ' -- '; } elsif ($x < 0.97) { $word .= ' '.&conjunction.' '; } else { $word .= '; '; } if ($ll+length($word)>$opts{poison_paragraph_linewrap_col}) { $buf .= "\n"; $ll = length($word); } else { $ll += length($word); } $buf .= $word; } $buf .= "

\n\n"; } elsif ($format == 1) { # unordered list $buf = '
  • '; $capnext = 1; for $i (1..$wordcount) { $word = &word; substr($word,0,1) =~ tr/a-z/A-Z/, $capnext=0 if $capnext; $buf .= $word.".",next if $i==$wordcount; $x = rand 1; if ($x<0.50 || $word =~ tr/\n
  • "; } if ($ll+length($word)>$opts{poison_paragraph_linewrap_col}) { $buf .= "\n"; $ll = length($word); } else { $ll += length($word); } $buf .= $word; } $buf .= "
\n\n"; } elsif ($format == 2) { # heading, e.h. h1; almost no punctuation, no line breaks, short $_[0] and $buf = "<$_[0]>"; $capnext = 1 if rand(1) >= 0.5; for $i (0..$wordcount-1) { $word = &word(1); substr($word,0,1) = uc substr($word,0,1) if $capnext || !$i; $buf .= ' ' if $i; $buf .= $word; } $_[0] and $buf .= ""; } $buf; } sub conjunction { # no, these are not all conjunctions. conjunction() provides an increased frequency # of those parts of speech which occur as primary connectors in English, and should # therefore appear also in convincing poison. my @words = ( 'is', 'was', 'are', 'am', 'be', 'has', 'had', 'having', 'and', 'not', 'nor', 'neither', 'either', 'or', 'will', 'may', 'might','would','could','should','ought', 'do', 'did', 'done', 'doing','does', 'if', 'when', 'after', 'before', 'unless', 'until' ); $words[int rand ($#words + 1)]; } sub random_line { my $fh = shift || return undef; my $size = shift || return undef; my $l; until ($l) { seek($fh, int rand $size, 0) || die "seek: $!"; <$fh>; $l = <$fh>; # re-seek on comment lines if ($l) { chomp $l; $l =~ s/\s*#.*//; } } $l; } sub random_word { &random_line($dict, $dict_size) } sub random_spammer { &random_line($spammers, $spammers_size) } sub word { my ($i1,$x); my $word; my $realword_only = shift; $i1 = rand 1; if (!$realword_only && $i1<$opts{poison_address_frequency}) { $x = &address; $word = "$x"; } else { $word = &random_word; } $word; } sub address { my $n = rand 1; if ($opts{use_spammerlist} && $n<$opts{address_spammer_frequency}) { return &random_spammer; } elsif ($opts{use_teergrube_addresses} && $n<($opts{address_spammer_frequency}+ $opts{address_teergrube_frequency})) { # return &teergrube_username . '@' . $opts{teergrube_address_fqdn}; return &teergrube_username . '@' . $teergrube_address_fqdn[int rand @teergrube_address_fqdn]; } &fake_address; } sub fake_address { my @charset = ( 'a'..'z', 'a'..'z', 'a'..'z', 'A'..'Z', '0'..'9','-','.' ); my ($i,$s); my $addr = &fake_username.'@'; for ($i=0; $i<1+int rand 4; $i++) { $s = &random_word; $s =~ tr/a-z//cd; $addr .= $s; $addr .= '.'; } $addr .= $TLDs[int rand @TLDs]; $addr; } sub fake_username { my $un = ''; my @charset = ( 'a'..'z', 'a'..'z', 'a'..'z', 'A'..'Z', '0'..'9','-','.' ); if (rand 1 <= $opts{poison_word_username_frequency}) { $un = &random_word; $un .= ('a'..'z')[int rand 26] for (length($un)..int rand 14-length($un)); if (rand 1 <= $opts{poison_wordnumber_username_frequency}) { $un .= (0..9)[int rand 10] for (0..int rand $opts{poison_wordnumber_username_maxdigits}); } } else { $un = ('a'..'z')[int rand 26]; for (0..1+int rand 14) { $un .= $charset[int rand($#charset+1)]; } } $un; } # encode $ENV{REMOTE_ADDR} in a reversible, random-looking hash. # # The general issue: encode the 32 bits of an IPv4 address in a printable, # reversible hash with enough randomization to produce many permutations, and # which doesn't look too obviously like it has an IP address encoded in it. # # This approach could be improved, but does the job adequately while # accomplishing the above goals. # # A teergrube-bait username splits each byte of the IP address into high # and low-order nybbles, placing the four high-order nybbles before the # four low-order ones (8 characters sofar). It then selects a random # 8-bit permutation value, and for every true bit in that value, inserts # a random letter into the corresponding spot in the encoded address. # The permutation value is then encoded in the same fashion as the address, # with its high and low nybble encoded into two characters which are then # prepended to the address. In each case, nybbles are added to 97 ('a') # to render them printable. # # hence, the result: # # {p.h + 'a'}{p.l + 'a'} # [{0,25} + 'a']{(a1>>4) + 'a'} # [{0,25} + 'a']{(a2>>4) + 'a'} # ... # [{0,25} + 'a']{(a1&0xf) + 'a'} # [{0,25} + 'a']{(a2&0xf) + 'a'} # ... # @teergrubehost.domain.tld # # Where a1,a2,a3,a4 are the four octets of the IPv4 address, and the presence # of the character in [] brackets is dictated by whether a 1 is present in # the permutation value. # # This hash may be reversed with decode_teergrube.pl, included with sugarplum. sub teergrube_username { my @addr = split(/\./,shift || $ENV{REMOTE_ADDR} || '127.0.0.1'); my $packed = pack('c*', map { $_+97 } ((map { $_ >> 4 } @addr), (map { $_ & 0xf } @addr))); my $permutation = int rand 255; my $uname = pack('cc', 97+($permutation>>4),97+($permutation&0xf)). $packed; my $offset = 2; for (0..7) { if (($permutation & 1<<$_)) { my $rc = chr(97 + int rand 25); $uname = substr($uname,0,$_+$offset) . $rc. substr($uname,$_+$offset); $offset++; } } $uname; } # $Log: poison,v $ # Revision 1.13 2002/09/27 11:16:29 aqua # *** empty log message *** # # Revision 1.12 2000/12/28 11:11:34 aqua # - added teergrube address generation (default off) # - added deterministic mode (default off) # - adjusted frequency of conjunction()-inserted words upward # - cleaned up a few C-style for loops to perl list style # # Revision 1.11 2000/11/22 21:20:05 aqua # increment version for release. # # Revision 1.10 2000/11/22 21:14:09 aqua # Added Last-Modified header computation contributed by # Eric Eisenhart # # Added a variation on dictionary-word username generation contributed # by Richard Balint # # Adjusted conjunction frequency upward. # # Added duly-randomized and tags that should correlate # with the HTML produced by sugarplum. # # Added UID/GID reporting to dict-open failure. # # Revision 1.9 1999/06/04 23:20:28 aqua # added teleport-28 to dos_agent_patterns # # Revision 1.8 1999/06/04 23:14:19 aqua # Added background randomization option # # Revision 1.7 1999/06/04 22:51:54 aqua # Er, maybe _now_ the link-in-link is fixed. Found a better # way, and the old way wasn't working anyway. # # Revision 1.6 1999/06/04 22:42:37 aqua # Fixed mailto: in href problem (reported by Alexander Kourakos), # added head section, some randomly-chosen meta headers, fixed # problem with h1 headings getting lines broken and making a mess. # # Revision 1.5 1999/06/01 23:53:04 aqua # Added $VERSION # # Revision 1.4 1999/06/01 22:37:23 aqua # Added spambot agent patterns # # Revision 1.3 1999/06/01 21:59:11 aqua # Added env sanitizing to satisfy -T # # Revision 1.2 1999/06/01 21:44:56 aqua # Changed default loglevel # # Revision 1.1 1999/06/01 11:06:05 aqua # Initial revision #