#Oracal by Dale Swanson December 31st 2008
#accept a question as input and then form a response from sentences gathered from a Google search
#requires you have lynx installed
# http://www.pervalidus.net/cygwin/lynx/

#Here's my outline:
#A Perl program accepts as input a line of text (to be copied by a human from some discussion), possibly taking in a whole paragraph from an input file.  Perl creates a google search url for that line, and sends that command to lynx to dump the web results to a file.  Then it opens the web results and sends each of the pages to lynx for dumping to files.  Once it has the files with the content of the web pages it opens each and takes anything that looks like a logical sentence and creates an array of them.  Then it randomly selects about 10 sentences and outputs them to a final output file.  A human has to copy and paste the reply into the forum.
#lynx -dump http://www.google.com/search?q=monkeys >goog.txt

#To Do:
#integrate with other programs that automatically get a question, or submit an answer
#consider adding misspellings to filter file
#answer - convert to all lower case, or some other normalized format, perhaps edit text in some way to hamper searching.
#add exemption to common abbreviations from being split like sentences (ex Mr. Swanson)
#add something from the question to the answer file name, maybe just first word
#figure out a way to get lynx to timeout on slow webpages
#automatically use google's recomendation
#add some randomness to number of senteces in answer
#parenthesis in question seems to cause bugs

#2009.09.12 At some point Google changed 'Similar Pages' to just 'Similar', breaking the program.
#Also they removed the size of the html, in the results (" - 18k - ").
#I've decided to fix this by using the link dump, which previously had been ignored.

#!/usr/bin/perl
use strict;
use warnings;

use Cwd;

my $debug = 0; #debug mode, set to 1 to get lot's of output
my $quesfilename = "question.txt"; #the name of the file which has the question in it
my $lynxpath = "C:\\Program Files\\lynx\\"; #the path where lynx is installed
my $numberofsites = 14; #how many sites to try to get (+1 due to using 0-$numberofsites)
my $numberofsentences = 7; #how many sentences should be in the answer
my $answerrandomness = 5; #plus or minus how many sentences should be in the answer
my $answerfile = "answer." . time . ".txt"; #filename of the answer with the unique timestamp added
#my $answerfile = "answer.txt"; #uncomment this line to omit time
my $minlinelength = 100; #minimum number of bytes a single line must have (line is multiple sentences until the first newline)
my $maxlinelength = 10000; #maximum number of bytes for a single line
my $minsentencelength = 25; #minimum length in bytes for a sentence
my $maxsentencelength = 300; #maximum length for a sentence
my $maxquestionlength = 7; #maximum length of the question, in words, if over will pick the longest words

my $question; #the actual question being asked
my @questionwords; #question split up into words
#my @temparray; #temporary array used for sorting, not used due to @unique
my $dir; #the directory the program is running from
my @filearray; #will store the contents of the file
my $fileline; #will store each line of the file
my $flag; #a flag to determine if we have reached the search results yet
my @answerurls; #array to store the urls of the web results where we will get our answers from
my $url; #stores various urls
my $urlcount; #a counter of how many answer urls we have
my $x; #counter for loops
my $filterline; #line of the filter file
my @filterlist; #the list of words to filter by
my $filterword; #single word from filter
my @sentencepool; #pool of sentences to be drawn from when picking the random sentences
my @paragraph; #will hold multiple sentences from our answer results
my $sentence; #sentence from paragraph array
my $lettercount; #number of letters a-z, A-Z in the sentence

$numberofsentences += int(rand($answerrandomness * 2) - $answerrandomness);
print "\nNumber of sentences - $numberofsentences";

$dir = cwd(); #starting directory
open(quesfile, $quesfilename) or die("Error: cannot open file $quesfilename\n"); #opens the file with our question
$question = <quesfile>; #grabs the first line and uses it as a question
close(quesfile);
chomp($question); #gets rid of newlines
$question = lc($question); #converts to all lower case

open(filter, "filter.txt") or die("Error: cannot open file filter.txt\n"); #opens the file with our question
$filterline = <filter>; #grabs the first line and uses it as a question
close(filter);
chomp($filterline); #gets rid of newlines
$filterline = lc($filterline); #converts to all lower case
@filterlist = split(/\,/, $filterline); #separate our filter into words

$question =~ s/[^a-zA-Z0-9\s]//g;#filters anything that's not a letter, digit, or space.  Might be a good idea to replace with spaces and then remove excessive spaces later

foreach $filterword (@filterlist)
{#go through each word in our filter file and see if it's in the question
	$question =~ s/\s$filterword\s/ /g; #find this filter word in the question and replace it with nothing
}

@questionwords = split(/ /, $question); #break the question down into words
if (@questionwords > $maxquestionlength)
{#the question is too long, so we will take the longest words and use them
	my %hash = map {$_, 1} @questionwords; #convert array to a hash so we can remove duplicate words
	my @unique = keys %hash; #makes an array containing only the unique words
	@questionwords = sort {length($b) <=> length($a)} @unique; #sort the array by character length, longest to shortest
	$question = ""; #blank the question so we can add just the longest words
	for ($x = 0; $x <= $maxquestionlength ; $x++)
	{#add the longest words up to the limit set
		$question .= $questionwords[$x] . " "; #add the word
	}
}
print "\nQuestion is:\n$question";
if(! -e "$dir/files/")
{#checks to see if /files/ exists, and if not creates it
	mkdir ("$dir/files/");
}

#makes the batch file that will run lynx
open(ofile, ">files/lynxbatch.bat"); #output file name
print ofile ("cd $lynxpath\n"); #change directory to the lynx directory so we can run the file
print ofile ("lynx -dump -width=9999 \"http://www.google.com/search?q=$question&num=100\" >\"$dir/files/goog.txt\"\n"); #actual command to get webpage
print ofile ("cd \"$dir\"\n"); #go back to the directory the perl script is running in
close(ofile);

print "\nPerforming Google Search...";
system "$dir/files/lynxbatch.bat"; #run the batch file to do the google search

print "\nProcessing Results...";
#used to harvest urls from the search results, but now grab them from the link dump lynx puts at the end of the file
open(ifile, "files/goog.txt"); #opens the results of our google search
@filearray = <ifile>; #loads the contents of the google search into an array
$flag = 0; #flag will be used as a boolean flag to store if we've reached the search results yet
$urlcount = 0; #a counter of how many answer urls we've gotten so far
foreach $fileline (@filearray)
{#go through each line of the google results (which is in @filearray), so we can search it for urls
	if ($fileline =~ m/Visible links/)
	{#we've found the link dump at the end of the file
		(print "\nFound:\n\n$fileline\n") if ($debug);
		$flag = 1;#we use our flag to show that we've found the link dump
	}
	if ($flag)
	{#if we've already reached the link dump then we can start harvesting possible urls
		#if ($fileline =~ m/www\.|\.org|\.com|\.net|\.gov|\.edu/) #Not used because it gave false positives such as CNN.com in titles
		if ($fileline =~ m/http/)
		{#the link dump is almost all links (1 per line), matching http is barely needed
			#consider whitelisting file types (html, etc) instead of blacklisting below, downloading a binary file is unpleasant.
			if ($fileline !~ m/google|youtube|search|pdf$|ppt$|\.\.\./)
			{#filters results to exclude google pages and others
				#This should remove adlinks and other stuff because they have google referrers, who knows though.
				#Somtimes Google uses their IP instead of domain, consider matching any IP or stuff like 'search?q=cache'
				(print "\nFound URL:\n\n$fileline\n") if ($debug);
				# 230. http://www.chicagocares.org/  <- that is the format of all lines, just need to remove space(s), digits, period, space.
				$url = $fileline; #the fileline contains our url, just need to strip out the numbers
				$url =~ s/^\s+\d+\.\s//;#If Odin smiles on you this will strip spaces, digits, period, space from the start
				chomp($url); #evil newlines
				if ((int(rand(3)) == 0) &&($urlcount <= $numberofsites))
				{#adds this url to the list of answer urls, has a 1 in 4 random chance
					push(@answerurls, $url); #adds this url to our list of answerurls
					$urlcount++; #counts this url
				}
				(print "\nExtracted URL:\n\n$url\n") if ($debug);
			}
		}
	}
}
close(ifile);
print "\nAnswer URLs:\n@answerurls";

#makes the batch file that will run lynx
open(ofile, ">files/lynxbatcha.bat"); #output file name
print ofile ("cd $lynxpath\n"); #change directory to the lynx directory so we can run the file
for ($x = 0; $x <= $numberofsites ; $x++)
{#go through our answer urls and make the lynx command for each
	print ofile ("lynx -dump -width=9999 -nolist -connect_timeout=5 -timeout=5 \"$answerurls[$x]\" >\"$dir/files/answer$x.txt\"\n"); #the command to download this webpage
}
print ofile ("cd $dir\n"); #go back to directory the perl script is in
close(ofile);

print "\nDownloading Answers...";
#***comment out next line to skip downloading answer pages, and speed up program***
system "$dir/files/lynxbatcha.bat"; #run the batch file to get the answer pages
print "\nProcessing Results...";

for ($x = 0; $x <= $numberofsites ; $x++)
{#go through each result and grab the lines
	open(ifile, "files/answer$x.txt"); #opens the webpages we downloaded that contain our answer sentences
	@filearray = <ifile>; #loads this webpage into an array, with each line being one element
	foreach $fileline (@filearray)
	{#go through each line of the file, determine if it's good
		if (length($fileline) >= $minlinelength && length($fileline) <= $maxlinelength)
		{#line is within the acceptable range for length
			(print "\nFileline is OK:\n$fileline\n") if ($debug && int(rand(100)) == 0);
			@paragraph = split(/\.\s/, $fileline); #splits the fileline into sentences in the paragraph array, just uses '. ' (period space) as the delimiter, seems to be sufficient
			(print "\nSample Paragraph:\n@paragraph\n") if ($debug && int(rand(100)) == 0);
			foreach $sentence (@paragraph)
			{#go through each sentence and dermine if it's good
				chomp($sentence); #strip newlines
				$sentence =~ s/^\s+//;#strip leading spaces
				$sentence =~ s/\s+$//;#strip trailing spaces
				$sentence =~ s/\s+/ /g;#strip multiple spaces
				if (length($sentence) >= $minsentencelength && length($sentence) <= $maxsentencelength)
				{#sentence is within acceptable range for length
					#other tests go here
					#possible other tests,  ratio of normal chars (a-z,A-Z) vs all other chars.
					#require some of the words from the filter file
					#filter some words common for system messages, "reload this page", "reply to this", "is offline", "Reply With Quote", "Threaded Mode", "posted 2004-Mar-23", "Powered By IP.Board", "Email this Page, Search for:"
					#consider using an external list, or at least an array for the phrases to filter
					$lettercount = ($sentence =~ tr/([a-zA-Z])//); #counts the number of letters a-z, A-Z in the sentence
					$lettercount++; #increment to prevent x/0
					if ($sentence !~ m/\s\|\s/ && # ' | ' is often used between text menus, thus we filter it
					length($sentence) / $lettercount < 2 && #ratio of letters to nonletters,
					$sentence !~ m/reply with quote/i &&
					$sentence !~ m/reload this page/i &&
					$sentence !~ m/reply to this/i &&
					$sentence !~ m/Log In/i &&
					$sentence !~ m/Originally Posted by/i) 
					{
						#the sentence has passed all our tests, so we will add it to the array
						$sentence = $sentence . ". "; #adds a period and space since they were striped before
						(print "\nSentence is OK:\n$sentence\n") if ($debug && int(rand(100)) == 0);
						push(@sentencepool, $sentence); #this sentence is good, so we add it to the pool
					}
				}
			}
		}
	}
}

if ($debug)
{#if we are in debug mode we output the whole sentence pool to a text file
	open(ofile, ">files/sentencepool.txt"); #output file name
	print ofile (@sentencepool); #dump sentence pool to text file
	close(ofile);
}

open(ofile, ">$answerfile"); #output the answer
for ($x = 0; $x <= $numberofsentences ; $x++)
{#pick random sentences from our pool and write them to the answer file
	print ofile ($sentencepool[rand @sentencepool]); #puts a sentence in the answer file
}
close(ofile);
print "\nAnswer outputed to - $answerfile";
print "\nDone!\n";