#!/usr/bin/perl -Tw

# $Id: poison 116 2003-04-01 12:21:57Z aqua $

# sugarplum poison CGI
# Copyright (c) 1999 by Devin Carraway <sugarplum@devin.com>
# Freely distributable under terms of the GPL.

# The poison CGI does the actual dealings with spam spiders.
# It has the job of giving them something realistic to
# peruse, including valid-looking addresses with which to
# poison the spammer's database.

# For the obvious reason, this script should either be named
# innocuously, or else ScriptAliased in such a way as to hide
# its name.  Pick something suitably random.

# See http://www.devin.com/sugarplum/ for the rest.

# The user should adjust the configuration options below to
# suit their particular whims.  Also it may be desirable to
# alter the subroutine fake_address() if a particular spam
# collection system is in place on the site already.

use strict;

use Getopt::Long;
use Sys::Hostname;
use IO::File;

my $VERSION = 0.9.10;

# American TLDs for use generating random addresses.  Non-US TLDs omitted,
# on the presumption that it will reduce international bandwidth costs.
my @TLDs = ( 'com', 'org', 'net', 'edu', 'jp', 'co.cn' );
my @teergrube_address_fqdn = ( 'spamcatcher.teergrube.net', 'spamyourself.teergrube.net' );

my %opts = (
	    # log level (0 for no logging besides the webserver's own)
	    loglevel => 0,

	    # log file (if loglevel set true)
	    logfile => '/var/log/sugarplum.log',

	    # Should sugarplum work deterministically?  If set true, a given
	    # URL will always seed perl's RNG with a consistent value (sum of
	    # the ordinal values of the bytes in the PATH_INFO variable) --
	    # consequently the same random values will be picked, so multiple
	    # reloads of any given URL in sugarplum will produce identical
	    # output.  Hence if a harvester were to check for poison by
	    # comparing data on subsequent loads, they won't be tipped off.
	    deterministic => 1,

	    # If set, the system hostname will be included in the seed in
	    # deterministic mode.  This introduces variance into different
	    # sugarplum installations with identical paths (e.g. /sugarplum/),
	    # thus preventing an attacker from doing hash-based poison
	    # avoidance.  This is desirable in all cases other than when
	    # using sugarplum on sites run on clustered servers, where the
	    # hostname may not be consistent across multiple loads of the
	    # same URI.
	    deterministic_by_hostname => 1,

	    # If set, the HTTP server name will be rolled into the seed
	    # in deterministic mode.  This yields differing sugarplum
	    # output across different virtualhosts, or indeed across
	    # different servers.  A possible drawback is that it may be
	    # possible for an attacker to detect the poison programmatically
	    # on sites using wildcard-matched vhosts.
	    deterministic_by_httphost => 1,

	    # path to the dictionary
	    dictfile => '/usr/share/dict/words',
	    
	    # if true, entries from the above spammer email list will
	    # be included as addresses.  See poison_spammer_frequency.
	    use_spammerlist => 0,

	    # path to the known-spammer list, one email address per line.
	    # not needed if use_spammerlist is set false.
	    spammerfile => '/etc/sugarplum/spammers',

	    # if true, randomly generated email addresses will be
	    # included in the output.  See poison_address_frequency.
	    use_poison_addresses => 1,

	    # if true, generated addresses will be used based on those
	    # inside your teergrube -- see address_teergrube_frequency.
	    use_teergrube_addresses => 1,

	    # how many paragraphs of poisoned output?
	    poison_paragraphs => 8,

	    # should the background be randomized?  (arguably improves
	    # page plausibility, but often looks wonky to humans)
	    poison_random_background => 1,

	    # How deep should the URLs recurse (remember, this is
	    # an exponential factor)
	    poison_maxdepth => 9,

	    # Minimum number of words per paragraph?
	    poison_paragraph_wordcount_base => 50,

	    # Random range of words beyond the base?
	    poison_paragraph_wordcount_range => 75,

	    # Column before which lines in output should have newlines
	    # appended; simulates editor linewrap.  Set very high to
	    # disable linewrap.
	    poison_paragraph_linewrap_col => 78,

	    # What percentage of email addresses should be derived from a
	    # dictionary word? (range 0 to 1 inclusive)
	    poison_word_username_frequency => 0.6,
	    # and of those, what percentage should have numbers attached to the
	    # end (as with AOL and similar providers)?  (0-1 inclusive)
	    poison_wordnumber_username_frequency => 0.3,
	    # and if so, up to how many digits?
	    poison_wordnumber_username_maxdigits => 4,

	    # The below should add up to 1 in any combination.
	    # poison_spammer_frequency has been renamed --
	    # poison_address_frequency now specifies the frequency of all
	    # addresses, while address_spammer_frequency (below) denotes the
	    # portion of addresses taken from known spammers.

	    # Percentage of words which should be poisoned addresses (see
	    # below)
	    poison_address_frequency => 0.03,
	    # Percentage of words which should be normal words
	    poison_dictword_frequency => 0.97,
	

	    # Settings regulating what proportions of address types will be
	    # emitted -- should add up to 1.  Any zero value disables the
	    # feature.

	    # percentage of addresses emitted which should be taken from
	    # the list of known spammers (0 to disable)
	    address_spammer_frequency => 0.2,
	    # percentage of addresses emitted which should be generated in
            # the teergrube (randomuser@teergrube.domain.tld):
	    address_teergrube_frequency => 0.4,
	    # percentage of addresses emitted which should simply be totally
	    # random (0 to disable):
            address_random_frequency => 0.6,


	    # give a fully-qualified hostname, which will be
	    # used in generating tarpit addresses.
	    teergrube_address_fqdn => 'spamcatcher.teergrube.net',
);

sub read_config {
	my $fn = shift || return undef;
	my $f = new IO::File($fn) || die "$fn: $!";
	my $l = 0;

	while (<$f>) {
		$l++;
		next if /^(\s*#|\s*$)/;
		chomp;
		if (/^(\w+):\s+(\S.*)/) {
			$opts{$1} = $2;
		} else {
			die "malformed config '$_' in $fn:$l\n";
		}
	}
	1;
}

sub usage {
	print "usage: $0\n",
		"\t-h : help\n",
		"\t-c <configfile> | --config <configfile> : specify config file\n",
		"output options:\n",
		map { tr/_/-/; "\t--$_ <value>\n" } sort keys %opts;
	exit 0;
}


$ENV{PATH} = '/bin:/usr/bin';
$ENV{IFS} = ' ';
delete $ENV{ENV};
delete $ENV{CDPATH};

## work out a runtime configuration

my $config_fn;
GetOptions('config=s' => \$config_fn, 'c=s' => \$config_fn,
	   'h' => \&usage,
	   (map { my $k = $_; $k =~ tr/_/-/;
	   	("$k=s" => \$opts{$_}) } keys %opts)) || &usage;

unless ($config_fn) {
	for ('/etc/sugarplum/config',
		'/usr/local/etc/sugarplum/config') {
		if (-e $_) { $config_fn = $_; last }
	}
}
$config_fn and &read_config($config_fn);


my ($dict,$dict_size,$spammers,$spammers_size);

my $uri = $ENV{REQUEST_URI} || 'http://localhost.test/sugarplum/';
$uri .= '/' unless $uri =~ /\/$/;
my $depth = !$ENV{PATH_INFO} || ($ENV{PATH_INFO} =~ tr:/:/:);
my $okrecurse = ($depth < $opts{poison_maxdepth});

$dict = new IO::File($opts{dictfile}) || die "dictfile $opts{dictfile}: $!";
$dict_size = (stat($dict))[7] || die "dictfile $opts{dictfile} is empty";

if ($opts{use_spammerlist}) {
	$spammers = new IO::File($opts{spammerfile}) ||
		die "spammerfile $opts{spammerfile}: $!";
	$spammers_size = (stat($spammers))[7] ||
		die "spammerfile $opts{spammerfile} is empty";
}

print "Content-Type: text/html\n",
	"Last-Modified: ".scalar localtime(((stat($0))[9]-65536)+
		length($ENV{PATH_INFO} || $0)*1280),
	"\n\n";

if ($opts{deterministic}) {
	my $seed = 0;
	if ($opts{deterministic_by_hostname}) {
		$seed += $_ for (unpack('C*',hostname || ''));
	}
	if ($opts{deterministic_by_httphost}) {
		$seed += $_ for (unpack('C*',$ENV{HTTP_HOST} ||
						$ENV{SERVER_NAME} || ''));
	}
	$seed += $_ for (unpack('C*',$ENV{PATH_INFO} || $0));
	srand($seed);
}


## HTML generation thus commences

if (!int rand 2) {
	if (!int rand 3) {
		my @dt = ('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"'.
			' "http://www.w3.org/TR/REC-html40/loose.dtd">',
			'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">');
		print $dt[int rand ($#dt+1)],"\n";
	}
	print int rand 2 ? '<html>' : '<HTML>',"\n";
}

if (!int rand 2) {
  # sometimes have a head of varying elaboration
  print "<head>\n";
  my $indent = int rand 2 ? "\t" : ' ' x int rand 10;
  print $indent,&paragraph(1 + int rand 10,2,'title'),"\n";
  !int rand 3 and print $indent,"<meta name=\"robots\" content=\"noindex, nofollow\">\n"; # irony?
  !int rand 4 and print $indent,"<meta name=\"description\" content=\"",
                        &paragraph(1+int rand 20,2),"\">\n";
  !int rand 4 and print $indent,"<meta name=\"keywords\" content=\"",
                        &paragraph(1+int rand 20,2),"\">\n";
  print "</head>\n";
} elsif (int rand(1)<0.9) {
  # _almost_ always have a title.  :)
  print &paragraph(1 + int rand 10,2,'title'),"\n";  
}
$opts{poison_random_background} and !int rand 2 and
  printf "<body bgcolor=\"#%.2x%.2x%.2x\" text=\"#%.2x%.2x%.2x\">\n",
    int rand 0x100,int rand 0x100, int rand 0x100,
    int rand 0x100,int rand 0x100, int rand 0x100;
!int rand 2 and print &paragraph(1 + int rand 8,2,'h1'),"\n\n";
!int rand 3 and $okrecurse and do {
  my $x;
  print '<p>[ ';
  for (0..1+int rand 8) {
    $x = &random_word;
    print "<a href=\"$uri$x\">$x</a> | ";
  }
  $x = &random_word;
  print "<a href=\"$uri$x\">$x</a> ]</p>";
};

for (1..$opts{poison_paragraphs}) {
  print &paragraph($opts{poison_paragraph_wordcount_base} +
		   int rand $opts{poison_paragraph_wordcount_range},int rand 2);
}

if ($opts{loglevel}) {
	my $log = new IO::File('>>'.$opts{logfile}) || die "$opts{logfile}: $!";

	print $log join("\t",scalar localtime,
			     $ENV{REMOTE_HOST} || 'unknown host',
			     $ENV{REMOTE_ADDR} || 'unknown addr',
			     $ENV{HTTP_USER_AGENT} || 'unknown agent',
			     $uri),"\n";
}

exit 0;


sub paragraph {
  my $wordcount = shift;
  my $format = shift;
  my ($i,$i1,$x,$word,$capnext);
  my $buf = '';
  my $ll = 0;

  $wordcount ||= 50 + int rand 75;
  if (!$format) {
    # simple words, no formatting
    $buf = '<p>';
    $capnext = 1;
    for $i (1..$wordcount) {
      $word = &word;
      substr($word,0,1) =~ tr/a-z/A-Z/, $capnext=0 if $capnext;
      $buf .= $word.".",next if $i==$wordcount;
      $x = rand 1;
      if ($x<0.35 || $word =~ tr/</</) {	  
	$word .= ' ';
      } elsif ($x<0.4 and $okrecurse) {
	# twisting the knife
	$word = "<a href=\"$uri$word\">$word</a> ";
      } elsif ($x < 0.50) {
	# end of sentence
	$word .= (('.') x 20,		  
		  '.','.','!','?','?')[int rand 25].'  ';
	$capnext = 1;
      } elsif ($x < 0.56) {
	$word .= ', ';
      } elsif ($x < 0.58) {
	$word .= ': ';
      } elsif ($x < 0.60) {
	$word .= ' -- ';
      } elsif ($x < 0.97) {
	$word .= ' '.&conjunction.' ';
      } else {
	$word .= '; ';
      }
      if ($ll+length($word)>$opts{poison_paragraph_linewrap_col}) {
	$buf .= "\n";
	$ll = length($word);
      } else {
	$ll += length($word);
      }
      $buf .= $word;
    }
    $buf .= "</p>\n\n";
  } elsif ($format == 1) {
    # unordered list
    $buf = '<ul><li>';
    $capnext = 1;
    for $i (1..$wordcount) {
      $word = &word;
      substr($word,0,1) =~ tr/a-z/A-Z/, $capnext=0 if $capnext;
      $buf .= $word.".",next if $i==$wordcount;
      $x = rand 1;
      if ($x<0.50 || $word =~ tr/</</) {
	$word .= ' ';
      } elsif ($x < 0.60) {
	$word .= (('.') x 20,		  
		  '.','.','!','?','?')[int rand 25].'  ';
	$capnext = 1;
      } elsif ($x < 0.62) {
	$word .= ', ';
      } elsif ($x < 0.68) {
	$word .= ': ';
      } elsif ($x < 0.70) {
	$word .= ' -- ';
      } elsif ($x < 0.90) {
	$word .= ' '.&conjunction.' ';
      } else {
	$word .= "</li>\n<li>";
      }
      if ($ll+length($word)>$opts{poison_paragraph_linewrap_col}) {
	$buf .= "\n";
	$ll = length($word);
      } else {
	$ll += length($word);
      }
      $buf .= $word;
    }
    $buf .= "</li></ul>\n\n";
  } elsif ($format == 2) {
    # heading, e.h. h1; almost no punctuation, no line breaks, short
    $_[0] and $buf = "<$_[0]>";
    $capnext = 1 if rand(1) >= 0.5;
    for $i (0..$wordcount-1) {
      $word = &word(1);
      substr($word,0,1) = uc substr($word,0,1) if $capnext || !$i;
      $buf .= ' ' if $i;
      $buf .= $word;
    }
    $_[0] and $buf .= "</$_[0]>";
  }
  $buf;
}

sub conjunction {
  # no, these are not all conjunctions.  conjunction() provides an increased frequency
  # of those parts of speech which occur as primary connectors in English, and should
  # therefore appear also in convincing poison.
  my @words = ( 'is', 'was', 'are', 'am', 'be',
		'has', 'had', 'having',
		'and', 'not', 'nor', 'neither', 'either', 'or',
		'will', 'may', 'might','would','could','should','ought',
		'do', 'did', 'done', 'doing','does',
		'if', 'when', 'after', 'before', 'unless', 'until' );
  $words[int rand ($#words + 1)];
}

sub random_line {
	my $fh = shift || return undef;
	my $size = shift || return undef;
	my $l;

	until ($l) {
		seek($fh, int rand $size, 0)
			|| die "seek: $!";
		<$fh>;
		$l = <$fh>;
		# re-seek on comment lines
		if ($l) {
			chomp $l;
			$l =~ s/\s*#.*//;
		}
	}
	$l;
}
sub random_word { &random_line($dict, $dict_size) }
sub random_spammer { &random_line($spammers, $spammers_size) }

sub word {
  my ($i1,$x);
  my $word;
  my $realword_only = shift;
  
  $i1 = rand 1;
  if (!$realword_only && $i1<$opts{poison_address_frequency}) {
	$x = &address;
	$word = "<a href=\"mailto:$x\">$x</a>";
  } else {
    $word = &random_word;
  }
  $word;
}

sub address {
	my $n = rand 1;

	if ($opts{use_spammerlist} && $n<$opts{address_spammer_frequency}) {
		return &random_spammer;
	} elsif ($opts{use_teergrube_addresses} &&
			$n<($opts{address_spammer_frequency}+
				$opts{address_teergrube_frequency})) {
#		return &teergrube_username . '@' . $opts{teergrube_address_fqdn};
		return &teergrube_username . '@' . $teergrube_address_fqdn[int rand @teergrube_address_fqdn];
	}
	&fake_address;
}


sub fake_address {
  my @charset = ( 'a'..'z', 'a'..'z', 'a'..'z', 'A'..'Z', '0'..'9','-','.' );
  my ($i,$s);
  my $addr = &fake_username.'@';

  for ($i=0; $i<1+int rand 4; $i++) {
    $s = &random_word;
    $s =~ tr/a-z//cd;
    $addr .= $s;
    $addr .= '.';
  }
  $addr .= $TLDs[int rand @TLDs];
  $addr;
}

sub fake_username {
	my $un = '';
	my @charset = ( 'a'..'z', 'a'..'z', 'a'..'z', 'A'..'Z', '0'..'9','-','.' );

	if (rand 1 <= $opts{poison_word_username_frequency}) {
		$un = &random_word;
		$un .= ('a'..'z')[int rand 26]
			for (length($un)..int rand 14-length($un));
		if (rand 1 <= $opts{poison_wordnumber_username_frequency}) {
			$un .= (0..9)[int rand 10]
				for (0..int rand
				  $opts{poison_wordnumber_username_maxdigits});
		}
	} else {
		$un = ('a'..'z')[int rand 26];
		for (0..1+int rand 14) {
			$un .= $charset[int rand($#charset+1)];
		}
  	}
	$un;
}

# encode $ENV{REMOTE_ADDR} in a reversible, random-looking hash.
#
# The general issue: encode the 32 bits of an IPv4 address in a printable,
# reversible hash with enough randomization to produce many permutations, and
# which doesn't look too obviously like it has an IP address encoded in it.
#
# This approach could be improved, but does the job adequately while
# accomplishing the above goals.
#
# A teergrube-bait username splits each byte of the IP address into high
# and low-order nybbles, placing the four high-order nybbles before the
# four low-order ones (8 characters sofar).  It then selects a random
# 8-bit permutation value, and for every true bit in that value, inserts
# a random letter into the corresponding spot in the encoded address.
# The permutation value is then encoded in the same fashion as the address,
# with its high and low nybble encoded into two characters which are then
# prepended to the address.  In each case, nybbles are added to 97 ('a')
# to render them printable.
#
# hence, the result:
#
# 	{p.h + 'a'}{p.l + 'a'}
#		[{0,25} + 'a']{(a1>>4) + 'a'}
#		[{0,25} + 'a']{(a2>>4) + 'a'}
#		...
#		[{0,25} + 'a']{(a1&0xf) + 'a'}
#		[{0,25} + 'a']{(a2&0xf) + 'a'}
#		...
#		@teergrubehost.domain.tld
#
# Where a1,a2,a3,a4 are the four octets of the IPv4 address, and the presence
# of the character in [] brackets is dictated by whether a 1 is present in
# the permutation value.
#
# This hash may be reversed with decode_teergrube.pl, included with sugarplum.

sub teergrube_username {
        my @addr = split(/\./,shift || $ENV{REMOTE_ADDR} || '127.0.0.1');
        my $packed = pack('c*',
                        map { $_+97 }
                                ((map { $_ >> 4 } @addr),
                                 (map { $_ & 0xf } @addr)));
        my $permutation = int rand 255;
        my $uname = pack('cc',
                        97+($permutation>>4),97+($permutation&0xf)).
                        $packed;
        my $offset = 2;

        for (0..7) {
                if (($permutation & 1<<$_)) {
                        my $rc = chr(97 + int rand 25);
                        $uname = substr($uname,0,$_+$offset) .
                                        $rc.
                                substr($uname,$_+$offset);
                        $offset++;
                }
        }
        $uname;
}


# $Log: poison,v $
# Revision 1.13  2002/09/27 11:16:29  aqua
# *** empty log message ***
#
# Revision 1.12  2000/12/28 11:11:34  aqua
# - added teergrube address generation (default off)
# - added deterministic mode (default off)
# - adjusted frequency of conjunction()-inserted words upward
# - cleaned up a few C-style for loops to perl list style
#
# Revision 1.11  2000/11/22 21:20:05  aqua
# increment version for release.
#
# Revision 1.10  2000/11/22 21:14:09  aqua
# Added Last-Modified header computation contributed by
# Eric Eisenhart <sugarplum/at/eisenhart.com>
#
# Added a variation on dictionary-word username generation contributed
# by Richard Balint <richard.balint/at/notes.udayton.edu>
#
# Adjusted conjunction frequency upward.
#
# Added duly-randomized <doctype> and <html> tags that should correlate
# with the HTML produced by sugarplum.
#
# Added UID/GID reporting to dict-open failure.
#
# Revision 1.9  1999/06/04 23:20:28  aqua
# added teleport-28 to dos_agent_patterns
#
# Revision 1.8  1999/06/04 23:14:19  aqua
# Added background randomization option
#
# Revision 1.7  1999/06/04 22:51:54  aqua
# Er, maybe _now_ the link-in-link is fixed.  Found a better
# way, and the old way wasn't working anyway.
#
# Revision 1.6  1999/06/04 22:42:37  aqua
# Fixed mailto: in href problem (reported by Alexander Kourakos),
# added head section, some randomly-chosen meta headers, fixed
# problem with h1 headings getting lines broken and making a mess.
#
# Revision 1.5  1999/06/01 23:53:04  aqua
# Added $VERSION
#
# Revision 1.4  1999/06/01 22:37:23  aqua
# Added spambot agent patterns
#
# Revision 1.3  1999/06/01 21:59:11  aqua
# Added env sanitizing to satisfy -T
#
# Revision 1.2  1999/06/01 21:44:56  aqua
# Changed default loglevel
#
# Revision 1.1  1999/06/01 11:06:05  aqua
# Initial revision
#