#Oracal by Dale Swanson December 31st 2008 #accept a question as input and then form a response from sentences gathered from a Google search #requires you have lynx installed # http://www.pervalidus.net/cygwin/lynx/ #Here's my outline: #A Perl program accepts as input a line of text (to be copied by a human from some discussion), possibly taking in a whole paragraph from an input file. Perl creates a google search url for that line, and sends that command to lynx to dump the web results to a file. Then it opens the web results and sends each of the pages to lynx for dumping to files. Once it has the files with the content of the web pages it opens each and takes anything that looks like a logical sentence and creates an array of them. Then it randomly selects about 10 sentences and outputs them to a final output file. A human has to copy and paste the reply into the forum. #lynx -dump http://www.google.com/search?q=monkeys >goog.txt #To Do: #integrate with other programs that automatically get a question, or submit an answer #consider adding misspellings to filter file #answer - convert to all lower case, or some other normalized format, perhaps edit text in some way to hamper searching. #add exemption to common abbreviations from being split like sentences (ex Mr. Swanson) #add something from the question to the answer file name, maybe just first word #figure out a way to get lynx to timeout on slow webpages #automatically use google's recomendation #add some randomness to number of senteces in answer #parenthesis in question seems to cause bugs #2009.09.12 At some point Google changed 'Similar Pages' to just 'Similar', breaking the program. #Also they removed the size of the html, in the results (" - 18k - "). #I've decided to fix this by using the link dump, which previously had been ignored. #!/usr/bin/perl use strict; use warnings; use Cwd; my $debug = 0; #debug mode, set to 1 to get lot's of output my $quesfilename = "question.txt"; #the name of the file which has the question in it my $lynxpath = "C:\\Program Files\\lynx\\"; #the path where lynx is installed my $numberofsites = 14; #how many sites to try to get (+1 due to using 0-$numberofsites) my $numberofsentences = 7; #how many sentences should be in the answer my $answerrandomness = 5; #plus or minus how many sentences should be in the answer my $answerfile = "answer." . time . ".txt"; #filename of the answer with the unique timestamp added #my $answerfile = "answer.txt"; #uncomment this line to omit time my $minlinelength = 100; #minimum number of bytes a single line must have (line is multiple sentences until the first newline) my $maxlinelength = 10000; #maximum number of bytes for a single line my $minsentencelength = 25; #minimum length in bytes for a sentence my $maxsentencelength = 300; #maximum length for a sentence my $maxquestionlength = 7; #maximum length of the question, in words, if over will pick the longest words my $question; #the actual question being asked my @questionwords; #question split up into words #my @temparray; #temporary array used for sorting, not used due to @unique my $dir; #the directory the program is running from my @filearray; #will store the contents of the file my $fileline; #will store each line of the file my $flag; #a flag to determine if we have reached the search results yet my @answerurls; #array to store the urls of the web results where we will get our answers from my $url; #stores various urls my $urlcount; #a counter of how many answer urls we have my $x; #counter for loops my $filterline; #line of the filter file my @filterlist; #the list of words to filter by my $filterword; #single word from filter my @sentencepool; #pool of sentences to be drawn from when picking the random sentences my @paragraph; #will hold multiple sentences from our answer results my $sentence; #sentence from paragraph array my $lettercount; #number of letters a-z, A-Z in the sentence $numberofsentences += int(rand($answerrandomness * 2) - $answerrandomness); print "\nNumber of sentences - $numberofsentences"; $dir = cwd(); #starting directory open(quesfile, $quesfilename) or die("Error: cannot open file $quesfilename\n"); #opens the file with our question $question = ; #grabs the first line and uses it as a question close(quesfile); chomp($question); #gets rid of newlines $question = lc($question); #converts to all lower case open(filter, "filter.txt") or die("Error: cannot open file filter.txt\n"); #opens the file with our question $filterline = ; #grabs the first line and uses it as a question close(filter); chomp($filterline); #gets rid of newlines $filterline = lc($filterline); #converts to all lower case @filterlist = split(/\,/, $filterline); #separate our filter into words $question =~ s/[^a-zA-Z0-9\s]//g;#filters anything that's not a letter, digit, or space. Might be a good idea to replace with spaces and then remove excessive spaces later foreach $filterword (@filterlist) {#go through each word in our filter file and see if it's in the question $question =~ s/\s$filterword\s/ /g; #find this filter word in the question and replace it with nothing } @questionwords = split(/ /, $question); #break the question down into words if (@questionwords > $maxquestionlength) {#the question is too long, so we will take the longest words and use them my %hash = map {$_, 1} @questionwords; #convert array to a hash so we can remove duplicate words my @unique = keys %hash; #makes an array containing only the unique words @questionwords = sort {length($b) <=> length($a)} @unique; #sort the array by character length, longest to shortest $question = ""; #blank the question so we can add just the longest words for ($x = 0; $x <= $maxquestionlength ; $x++) {#add the longest words up to the limit set $question .= $questionwords[$x] . " "; #add the word } } print "\nQuestion is:\n$question"; if(! -e "$dir/files/") {#checks to see if /files/ exists, and if not creates it mkdir ("$dir/files/"); } #makes the batch file that will run lynx open(ofile, ">files/lynxbatch.bat"); #output file name print ofile ("cd $lynxpath\n"); #change directory to the lynx directory so we can run the file print ofile ("lynx -dump -width=9999 \"http://www.google.com/search?q=$question&num=100\" >\"$dir/files/goog.txt\"\n"); #actual command to get webpage print ofile ("cd \"$dir\"\n"); #go back to the directory the perl script is running in close(ofile); print "\nPerforming Google Search..."; system "$dir/files/lynxbatch.bat"; #run the batch file to do the google search print "\nProcessing Results..."; #used to harvest urls from the search results, but now grab them from the link dump lynx puts at the end of the file open(ifile, "files/goog.txt"); #opens the results of our google search @filearray = ; #loads the contents of the google search into an array $flag = 0; #flag will be used as a boolean flag to store if we've reached the search results yet $urlcount = 0; #a counter of how many answer urls we've gotten so far foreach $fileline (@filearray) {#go through each line of the google results (which is in @filearray), so we can search it for urls if ($fileline =~ m/Visible links/) {#we've found the link dump at the end of the file (print "\nFound:\n\n$fileline\n") if ($debug); $flag = 1;#we use our flag to show that we've found the link dump } if ($flag) {#if we've already reached the link dump then we can start harvesting possible urls #if ($fileline =~ m/www\.|\.org|\.com|\.net|\.gov|\.edu/) #Not used because it gave false positives such as CNN.com in titles if ($fileline =~ m/http/) {#the link dump is almost all links (1 per line), matching http is barely needed #consider whitelisting file types (html, etc) instead of blacklisting below, downloading a binary file is unpleasant. if ($fileline !~ m/google|youtube|search|pdf$|ppt$|\.\.\./) {#filters results to exclude google pages and others #This should remove adlinks and other stuff because they have google referrers, who knows though. #Somtimes Google uses their IP instead of domain, consider matching any IP or stuff like 'search?q=cache' (print "\nFound URL:\n\n$fileline\n") if ($debug); # 230. http://www.chicagocares.org/ <- that is the format of all lines, just need to remove space(s), digits, period, space. $url = $fileline; #the fileline contains our url, just need to strip out the numbers $url =~ s/^\s+\d+\.\s//;#If Odin smiles on you this will strip spaces, digits, period, space from the start chomp($url); #evil newlines if ((int(rand(3)) == 0) &&($urlcount <= $numberofsites)) {#adds this url to the list of answer urls, has a 1 in 4 random chance push(@answerurls, $url); #adds this url to our list of answerurls $urlcount++; #counts this url } (print "\nExtracted URL:\n\n$url\n") if ($debug); } } } } close(ifile); print "\nAnswer URLs:\n@answerurls"; #makes the batch file that will run lynx open(ofile, ">files/lynxbatcha.bat"); #output file name print ofile ("cd $lynxpath\n"); #change directory to the lynx directory so we can run the file for ($x = 0; $x <= $numberofsites ; $x++) {#go through our answer urls and make the lynx command for each print ofile ("lynx -dump -width=9999 -nolist -connect_timeout=5 -timeout=5 \"$answerurls[$x]\" >\"$dir/files/answer$x.txt\"\n"); #the command to download this webpage } print ofile ("cd $dir\n"); #go back to directory the perl script is in close(ofile); print "\nDownloading Answers..."; #***comment out next line to skip downloading answer pages, and speed up program*** system "$dir/files/lynxbatcha.bat"; #run the batch file to get the answer pages print "\nProcessing Results..."; for ($x = 0; $x <= $numberofsites ; $x++) {#go through each result and grab the lines open(ifile, "files/answer$x.txt"); #opens the webpages we downloaded that contain our answer sentences @filearray = ; #loads this webpage into an array, with each line being one element foreach $fileline (@filearray) {#go through each line of the file, determine if it's good if (length($fileline) >= $minlinelength && length($fileline) <= $maxlinelength) {#line is within the acceptable range for length (print "\nFileline is OK:\n$fileline\n") if ($debug && int(rand(100)) == 0); @paragraph = split(/\.\s/, $fileline); #splits the fileline into sentences in the paragraph array, just uses '. ' (period space) as the delimiter, seems to be sufficient (print "\nSample Paragraph:\n@paragraph\n") if ($debug && int(rand(100)) == 0); foreach $sentence (@paragraph) {#go through each sentence and dermine if it's good chomp($sentence); #strip newlines $sentence =~ s/^\s+//;#strip leading spaces $sentence =~ s/\s+$//;#strip trailing spaces $sentence =~ s/\s+/ /g;#strip multiple spaces if (length($sentence) >= $minsentencelength && length($sentence) <= $maxsentencelength) {#sentence is within acceptable range for length #other tests go here #possible other tests, ratio of normal chars (a-z,A-Z) vs all other chars. #require some of the words from the filter file #filter some words common for system messages, "reload this page", "reply to this", "is offline", "Reply With Quote", "Threaded Mode", "posted 2004-Mar-23", "Powered By IP.Board", "Email this Page, Search for:" #consider using an external list, or at least an array for the phrases to filter $lettercount = ($sentence =~ tr/([a-zA-Z])//); #counts the number of letters a-z, A-Z in the sentence $lettercount++; #increment to prevent x/0 if ($sentence !~ m/\s\|\s/ && # ' | ' is often used between text menus, thus we filter it length($sentence) / $lettercount < 2 && #ratio of letters to nonletters, $sentence !~ m/reply with quote/i && $sentence !~ m/reload this page/i && $sentence !~ m/reply to this/i && $sentence !~ m/Log In/i && $sentence !~ m/Originally Posted by/i) { #the sentence has passed all our tests, so we will add it to the array $sentence = $sentence . ". "; #adds a period and space since they were striped before (print "\nSentence is OK:\n$sentence\n") if ($debug && int(rand(100)) == 0); push(@sentencepool, $sentence); #this sentence is good, so we add it to the pool } } } } } } if ($debug) {#if we are in debug mode we output the whole sentence pool to a text file open(ofile, ">files/sentencepool.txt"); #output file name print ofile (@sentencepool); #dump sentence pool to text file close(ofile); } open(ofile, ">$answerfile"); #output the answer for ($x = 0; $x <= $numberofsentences ; $x++) {#pick random sentences from our pool and write them to the answer file print ofile ($sentencepool[rand @sentencepool]); #puts a sentence in the answer file } close(ofile); print "\nAnswer outputed to - $answerfile"; print "\nDone!\n";