#!/usr/bin/perl
#
# Webcorpus citation finder.
#
# For each search term: query AltaVista, collect the document URLs listed on
# the result pages, fetch each document and print every sentence in which the
# term occurs (or the <TITLE> when only the title contains it).
#
# Usage:  script SEARCH_TERM
#         script FILE_WITH_ONE_SEARCH_TERM_PER_LINE
#
# NOTE(review): the copy of this script that was reviewed had been passed
# through an HTML-to-text conversion.  Line breaks were collapsed and every
# piece of text that looked like an HTML tag was removed, including tags
# inside regular expressions and string literals, and in places whole runs of
# code.  Everything marked "reconstructed" below was rebuilt from the
# surviving fragments (mostly the closing-tag patterns, which survived
# because they were written as <\/TAG>) and must be checked against an
# intact copy before this script is relied upon.

use strict;
use warnings;
use Socket;

######################### VARIABLES ######################

our $number_cites_wanted      = 300;
our $urls_per_altavista_page  = 10;
our $orig_number_cites_wanted = $number_cites_wanted;
our $altavista_hits           = 0;
our $altavista_wordcount      = 0;
our $goodcontext              = 1;
our $contextsize              = 40;
our $number_of_other_results_pages_found  = 0;
our $number_of_other_results_pages_needed = 0;
our $highest_number_URLs_allowed = 300;
our $URL            = "";
our $URL_to_search  = "";
our $term           = "";

# Running statistics, accumulated over all search terms.
our $number_urls_searched    = 0;
our $total_number_urls       = 0;
our $inaccessible_server     = 0;
our $accessible_server       = 0;
our $fourofour               = 0;
our $http_two_hundred        = 0;
our $other_http_error_code   = 0;
our $term_not_there          = 0;
our $term_in_page            = 0;
our $good_html_page          = 0;
our $crap_html               = 0;
our $title_hits              = 0;
our $body_hits               = 0;
our $neither_title_nor_body  = 0;
our $non_meta_hits           = 0;
our $metadata_hits           = 0;
our $table_list_citation     = 0;
our $potential_goodhits      = 0;
our $goodhits                = 0;
our $success                 = 0;
our $total_errors            = 0;

# Only read by end_output_page, which nothing calls at present.
our $hits      = 0;
our $wordcount = 0;
our $goodies   = 0;
our $duds      = 0;

our $separator = "******************************\n";
our $http      = "http://";

######## ALTA VISTA RESULT PAGE REFERENCES ##############
#
# URL example (markup partly lost in the reviewed copy):
#     b>URL: www.newadvent.org/cathen/p.htm
#
# next pages example (markup lost in the reviewed copy):
#     19

our $alta_vista  = "www.altavista.com";
# NOTE(review): the original literal contained markup that was lost; only
# this visible text is known.  It is matched as a substring for that reason.
our $null_search = "AltaVista found no document matching your query";
our $single_page = "1 page found";

##########################################################

#open(TESTFILE, ">test");

###########################################################
#                    MAIN PROGRAM                         #
###########################################################

# Entry point.  $argument is either a search term or the name of an existing
# file holding one search term per line.
sub main {
    my ($argument) = @_;

    if (!defined $argument || $argument eq '') {
        print STDERR "usage: $0 SEARCH_TERM | FILE_OF_SEARCH_TERMS\n";
        return;
    }

    # The original assigned 1 to an otherwise unused $output_autoflush
    # variable; unbuffered STDOUT is what that was evidently meant to achieve.
    $| = 1;

    my @search_terms;
    if (-e $argument) {
        open(my $terms_fh, '<', $argument)
            or die "cannot open $argument: $!\n";
        @search_terms = <$terms_fh>;
        close($terms_fh);
    }
    else {
        @search_terms = ($argument);
    }
    #print STDERR @search_terms;

    foreach my $raw_term (@search_terms) {
        my $query = $raw_term;
        $query =~ s/[\r\n]//g;
        $query =~ s/[ ]+$//;
        next if $query eq '';        # blank line in the terms file

        # AltaVista wants words joined by a single '+'.
        $query =~ s/[ ]+/\+/g;
        $query =~ s/[\+]+/\+/g;

        _search_one_term($query);
    }
    return;
}

# Runs the complete search for one '+'-joined term and prints the results.
sub _search_one_term {
    my ($query) = @_;

    $term = $query;    # start_output_page reads the global
    $URL  = "http://www.altavista.com/cgi-bin/query?mss=simple&pg=q&what=web&enc=iso88591&kl=XX&locale=xx&q=$query&search=Search";

    # &start_output_page;

    # GETTING 1st ALTA-VISTA RESULTS PAGE
    my @first_results_page = get_page_by_URL($URL);

    # CHECK FOR SOCKET ERRORS
    # 1st POSSIBLE MISS
    if (_is_socket_error(@first_results_page)) {
        print "\tAlta-Vista is currently down - too bad\n\n";
        return;
    }

    ($altavista_hits, $altavista_wordcount,
     $number_of_other_results_pages_found)
        = get_altavista_stats(@first_results_page);

    if ($altavista_hits == 0) {
        print "AltaVista contained no tokens of \"$query\"\n";
        return;
    }

    # Work on a per-term copy so that a term with few hits does not
    # permanently lower the limit for the terms that follow.
    my $cites_wanted = $number_cites_wanted;
    if ($cites_wanted > $highest_number_URLs_allowed) {
        $cites_wanted = $highest_number_URLs_allowed;
    }

    $number_of_other_results_pages_needed = 0;
    if ($cites_wanted > $urls_per_altavista_page) {
        # Total pages required, minus the first page we already hold.
        $number_of_other_results_pages_needed =
            int(($cites_wanted + ($urls_per_altavista_page - 1))
                / $urls_per_altavista_page) - 1;
        if ($number_of_other_results_pages_needed
            > $number_of_other_results_pages_found) {
            $number_of_other_results_pages_needed =
                $number_of_other_results_pages_found;
        }
    }

    my @other_results_pages = get_other_resultpages(
        $number_of_other_results_pages_needed, @first_results_page);
    my @all_results_pages = (@first_results_page, @other_results_pages);
    my @total_urls = extract_urls_from_results_pages(@all_results_pages);

    $orig_number_cites_wanted = $cites_wanted;
    if ($cites_wanted > scalar @total_urls) {
        $cites_wanted = scalar @total_urls;
    }

    for (1 .. $cites_wanted) {
        $number_urls_searched++;
        $URL_to_search = shift(@total_urls);
        find_target_page($URL_to_search, $query);
    }

    print_output();
    return;
}

# True when @page is one of the negative failure codes get_page_by_URL
# returns instead of response lines.
sub _is_socket_error {
    my (@page) = @_;
    return (@page == 1 && defined $page[0] && $page[0] =~ /^-\d+$/) ? 1 : 0;
}

######################################################
#                  SUB ROUTINES                      #
######################################################
#
# sub get_altavista_stats
# sub print_output
# sub find_target_page
# sub extract_urls_from_results_pages
# sub get_other_resultpages
# sub search_for_original_search_term
# sub get_page_by_URL
# sub start_output_page
# sub end_output_page
# sub remove_font_crap
# sub delete_metadata_hits
# sub delete_list_and_table_crap
# sub delete_table_crap
# sub delete_list_crap
# sub tidy_sentence_breaks
# sub delete_all_other_tags
#
######################################################

# Reads the summary figures off the first AltaVista result page.
#
# Takes the lines of that page.  Returns the three-element list the main
# program unpacks: (number of hits, word count text, number of further
# result pages).  The reviewed copy returned a list of URLs here, which made
# the caller's hit count a URL string that always compared equal to 0.
#
# The number of further pages is derived from the hit count and
# $urls_per_altavista_page; NOTE(review): whether the original counted the
# page links instead cannot be told from the surviving text.
sub get_altavista_stats {
    my (@first_results_page) = @_;

    my $hit_count  = 0;
    my $word_count = 0;

    foreach my $line (@first_results_page) {
        if (index($line, $null_search) >= 0) {
            $hit_count = 0;
        }
        if ($line =~ m/([0-9\,]+) pages found/i) {
            $hit_count = $1;
            $hit_count =~ s/\,//g;
        }
        # Substring match: lines read from the socket carry a line ending,
        # so the original string equality could never succeed.
        if ($line =~ m/\b\Q$single_page\E/i) {
            $hit_count = 1;
        }
        if ($line =~ /word count.*\: ([0-9\,]+)/) {
            $word_count = $1;
        }
    }

    my $pages = int(($hit_count + $urls_per_altavista_page - 1)
                    / $urls_per_altavista_page);
    my $other_pages = $pages > 1 ? $pages - 1 : 0;

    return ($hit_count, $word_count, $other_pages);
}

# Prints the running statistics.
#
# NOTE(review): reconstructed.  The main program calls print_output but its
# definition is missing from the reviewed copy; this version only reports
# the counters the rest of the script maintains, in an invented layout.
sub print_output {
    print $separator;
    print "AltaVista hits = $altavista_hits\n";
    print "AltaVista word count = $altavista_wordcount\n";
    print "cites wanted = $orig_number_cites_wanted\n";
    print "URLs listed = $total_number_urls\n";
    print "URLs searched = $number_urls_searched\n";
    print "inaccessible servers = $inaccessible_server\n";
    print "accessible servers = $accessible_server\n";
    print "HTTP 200 = $http_two_hundred\n";
    print "HTTP 3xx/4xx/5xx = $fourofour\n";
    print "other HTTP responses = $other_http_error_code\n";
    print "term not in page = $term_not_there\n";
    print "term in page = $term_in_page\n";
    print "pages with a title = $good_html_page\n";
    print "pages without a title = $crap_html\n";
    print "title hits = $title_hits\n";
    print "body hits = $body_hits\n";
    print "neither title nor body = $neither_title_nor_body\n";
    print "metadata-only hits = $metadata_hits\n";
    print "table/list-only hits = $table_list_citation\n";
    print "potential good hits = $potential_goodhits\n";
    print "good hits = $goodhits\n";
    return;
}

# Fetches one document and searches it for the term.
#
# NOTE(review): reconstructed.  The definition is missing from the reviewed
# copy; the argument order is taken from the call in the main program and
# from the parameter order of search_for_original_search_term.
sub find_target_page {
    my ($url, $query) = @_;
    return unless defined $url;

    my @page = get_page_by_URL($url);

    # 2nd POSSIBLE ERROR - server could not be reached
    if (!@page || _is_socket_error(@page)) {
        $inaccessible_server++;
        return;
    }

    $accessible_server++;
    search_for_original_search_term($query, $url, @page);
    return;
}

# Collects the document URLs listed on the given result-page lines.
#
# NOTE(review): partly reconstructed.  The matching and push logic survived
# (fused into get_altavista_stats); the sub header did not.  Markup may also
# have been lost from inside the pattern between "</b> " and the capture.
sub extract_urls_from_results_pages {
    my (@result_lines) = @_;
    my @document_urls;

    foreach my $line (@result_lines) {
        if ($line =~ /URL\:<\/b> (.*?)<\/font>/) {
            $total_number_urls++;
            push(@document_urls, "$http$1\n");
        }
    }
    return @document_urls;
}

# Fetches up to $number_of_other_results_pages_needed further result pages.
#
# The links are taken from the numbered page links on the first result page
# and fetched in the order they appear.  Returns the lines of all pages that
# could be fetched, concatenated.
#
# Page links are kept in a private list.  The reviewed copy pushed them onto
# the global @URLS, which also received document URLs, so the wrong URLs
# could be fetched.
sub get_other_resultpages {
    my ($pages_needed, @first_results_page) = @_;
    $pages_needed = 0 unless defined $pages_needed;

    my @page_urls;
    foreach my $line (@first_results_page) {
        # NOTE(review): reconstructed pattern - the <a href=...> part was
        # lost; only "[0-9]+<\/a>" and the use of $1 as the link survived.
        while ($line =~ /<a href="([^"]*)"[^>]*>[0-9]+<\/a>/gi) {
            push(@page_urls, "$http$alta_vista$1");
        }
    }

    my @extra_results_pages;
    while ($pages_needed-- > 0 && @page_urls) {
        my @next_results_page = get_page_by_URL(shift(@page_urls));
        next if _is_socket_error(@next_results_page);
        push(@extra_results_pages, @next_results_page);
    }
    return @extra_results_pages;
}

# Looks for the search term in one fetched document, prints every sentence
# (or the title) containing it and updates the statistics counters.
#
# Arguments: the '+'-joined term, the document URL, then the raw response
# lines (HTTP headers included).
sub search_for_original_search_term {
    my $query = shift(@_);
    my $url   = shift(@_);

    $query = '' unless defined $query;
    $query =~ s/\+/ /g;

    # The term is user input: match it literally, not as a pattern.
    my $term_re = qr/\Q$query\E/i;

    # The reviewed copy also assigned "[RECORDBREAKER]" to $/ here.  Nothing
    # in this sub reads from a handle, and the global assignment made every
    # later socket read return the whole response as a single line.
    my $web_page = join " ", @_, "[RECORDBREAKER]";
    my $length   = length($web_page);

    # print STDOUT "$length \n $query \n $url \n $web_page \n";

    # Flatten the document to one line of single-spaced text.
    # NOTE(review): the entity and tag names in the next lines are
    # reconstructed.
    $web_page =~ s/\&nbsp\;/ /gi;
    $web_page =~ s/\cM/ /g;
    $web_page =~ s/\n/ /g;
    $web_page =~ s/\t/ /g;
    $web_page =~ s/[ ]+/ /g;
    $web_page =~ s/[ ]+(<\/P>)/$1/gi;
    $web_page =~ s/[ ]+(<BR>)/$1/gi;
    $web_page =~ s/[ ]+(<\/HR>)/$1/gi;
    $web_page =~ s/<BR\b[^>]*>/<BR>/gi;
    $web_page =~ s/<HR\b[^>]*>/<HR>/gi;

    if ($web_page =~ /HTTP\/[0-9]\.[0-9] 200/) {
        $http_two_hundred++;

        # 4th POSSIBLE ERROR
        if ($web_page !~ $term_re) {
            $term_not_there++;
            # print "$separator NO CITATION for $query IN $url\n";
            return;
        }
        $term_in_page++;

        # NOTE(review): the original guard pattern was lost.  Using the
        # title split itself as the guard means the captures can never be
        # left over from an earlier match.
        if ($web_page !~ /(.*?)<TITLE>(.*?)<\/TITLE>(.*)/i) {
            # 5th POSSIBLE ERROR - crap html
            $crap_html++;
            # print "$separator CRAP HTML at $url\n";
            return;
        }
        my ($title, $body) = ($2, $3);
        $good_html_page++;

        if ($body =~ $term_re) {
            $body_hits++;
            my $removed_fonts = remove_font_crap($body);
            my $removed_meta  = delete_metadata_hits($removed_fonts);

            if ($removed_meta !~ $term_re) {
                # 6th POSSIBLE ERROR - metadata hit
                $metadata_hits++;
                # print "$separator CITATION IN METADATA, HREF, ETC., AT $url\n";
                return;
            }
            $non_meta_hits++;

            my $non_list_table = delete_list_and_table_crap($removed_meta);
            if ($non_list_table !~ $term_re) {
                # 7th POSSIBLE ERROR - Cit was in a list or table
                $table_list_citation++;
                # print "$separator CITATION IN TABLE/LIST AT $url\n";
                return;
            }
            $potential_goodhits++;

            # The reviewed copy passed a never-assigned $no_html to
            # tidy_sentence_breaks and never called delete_all_other_tags.
            my $no_html       = delete_all_other_tags($non_list_table);
            my $neat_and_tidy = tidy_sentence_breaks($no_html);

            while ($neat_and_tidy =~ s/(.*?[\.\?\!\n])//) {
                my $sentence = $1;
                next unless $sentence =~ $term_re;

                $goodhits++;
                $sentence =~ s/\[stop\]/\./g;
                $sentence =~ s/\[quot\]/\"/g;
                $sentence =~ s/\[ellipsis\]/\.\.\./g;
                print $separator;
                print "\nCIT = $sentence\n";
                print "LENGTH = $length\n";
                print "URL = $url\n";
            }
        }
        elsif ($title =~ $term_re) {
            # only return the <title> as citation if the body doesn't
            # contain the search term
            $title_hits++;
            print $separator;
            print "TITLE CITITATION = $title\n";
            print "LENGTH = $length\n";
            print "URL = $url\n";
        }
        else {
            $neither_title_nor_body++;
        }
    }
    # 3rd POSSIBLE ERROR, 404, etc.
    elsif ($web_page =~ /HTTP\/[0-9]\.[0-9] [345][0-9][0-9]/) {
        $fourofour++;
        # print "$separator ERROR FINDING THE PAGE AT $url\n";
    }
    else {
        $other_http_error_code++;
    }
    return;
}

# Fetches a URL with a plain HTTP/1.0 GET.
#
# Returns the response lines (status line and headers included).  Returns
# the empty list when $URL is not an http:// URL.  On failure returns a
# single negative number: -1 host lookup failed, -2 socket could not be
# created, -4 connection refused or failed.  (-3, a failed bind to the local
# address, can no longer occur: the bind was unnecessary for an outgoing
# connection and has been removed.)
sub get_page_by_URL {
    my ($URL) = @_;

    return unless defined $URL;
    return unless $URL =~ m{^http://([\w.-]+):?(\d*)($|/(.*))}i;
    my ($host, $port, $path) = ($1, $2, $3);

    $path = '/' if $path eq "";
    $port = 80  if $port eq "";
    $path =~ s/#.*//;    # a fragment is never sent to the server

    my $address = inet_aton($host);
    return -1 unless defined $address;

    my $proto = getprotobyname('tcp');
    my $sock;
    socket($sock, PF_INET, SOCK_STREAM, $proto) or return -2;

    unless (connect($sock, sockaddr_in($port, $address))) {
        close($sock);
        return -4;
    }

    # Unbuffer the socket, then put the selected handle back.
    my $previous_handle = select($sock);
    $| = 1;
    select($previous_handle);

    print {$sock} "GET $path HTTP/1.0\r\nHost: $host\r\n\r\n";

    # Read line by line whatever $/ the caller may have set.
    my @response = do { local $/ = "\n"; <$sock> };
    close($sock);

    # print STDOUT "<HI> @response <hi>\n";
    return @response;
}

# Prints the top of the HTML result page for the global $term.
# NOTE(review): reconstructed markup; only the visible text survived.
sub start_output_page {
    print "<HTML><HEAD><TITLE> ${term}: webcorpus search result";
    print "</TITLE></HEAD>\n<BODY>\n<H1>${term}: webcorpus search result</H1>\n<TABLE>\n\n";
    # print "${term}: webcorpus search result\n";
    return;
}

# Prints the summary line and the end of the HTML result page.
# NOTE(review): reconstructed markup; only the visible text survived.
sub end_output_page {
    print "</TABLE>\n<P>\nHITS $hits ";
    print "WC $wordcount ";
    print "GOODIES $goodies ";
    print "DUDS $duds ";
    print "</P></BODY></HTML>\n";
    return;
}

# Earlier context-extraction code, kept for reference (parts of the two
# print statements were lost in the reviewed copy):
#
# foreach $line (split /<(body|p|td|br|li)( [^>]*)?>/i, $web_page)
# {
#     $line =~ s/\cM$/\n/g;
#     $line =~ s/\cM/ /g;
#     $line =~ s/[ \t]+/ /g;
#     $line =~ s/<[^>]+>//gi;
#
#     if ($line =~ m/(^.*)($term)(.*)/i)
#     {
#         # break a "hit" up into 3 parts so I can show it prettily in html table
#         $this_term=$2;
#         $pre=substr($1, length($pre)-$contextsize, $contextsize);
#         $post=substr($3, 0, $contextsize);
#         # delete trailing part-words
#         $pre=~s/^[a-z\.,;:?\!\)]*//;
#         $post=~s/[^ \.,;:?\!\)]*$//;
#         $termcount++;
#
#         if (scalar(split(/\s/, $pre)) + scalar(split(/\s/, $post)) > $goodcontext)
#         {
#             $goodies++;
#             # if we're going to do duplicate-checking, we should either do it here
#             # (or store everything in an array and check at the end)
#
#             print " all";
#             print "$pre${this_term}${post}\n";
#         }
#     }
# }
# print $web_page;

# Removes the opening and closing forms of the named tags from $text,
# putting $replacement in their place.
sub _strip_tags {
    my ($text, $replacement, @tag_names) = @_;
    my $names = join '|', @tag_names;
    $text =~ s/<\/?(?:$names)\b[^>]*>/$replacement/gi;
    return $text;
}

# Removes bold, italic, underline and font tags, keeping their content.
# NOTE(review): opening-tag names reconstructed from the surviving closers
# </B>, </I>, </U>, </FONT>; BASEFONT is a guess for one lost pattern.
sub remove_font_crap {
    my $rec = $_[0];
    return _strip_tags($rec, '', qw(B I U BASEFONT FONT));
}

# Removes markup whose attributes could contain the term without the term
# being visible text: link targets, images, META and LINK elements.
# NOTE(review): tag names other than A and LINK are reconstructed.
sub delete_metadata_hits {
    my $rec = $_[0];
    $rec =~ s/<META\b[^>]*>//gi;
    $rec =~ s/<IMG\b[^>]*>/\[image\]/gi;
    $rec =~ s/<A\b[^>]*>(.*?)<\/A>/$1/gi;
    $rec =~ s/<LINK\b[^>]*>.*?<\/LINK>/ /gi;
    $rec =~ s/<LINK\b[^>]*>/ /gi;
    return $rec;
}

# Runs the table clean-up and then the list clean-up over $rec.
#
# NOTE(review): the end of this sub was lost.  It went on to test the result
# against a pattern that did not survive, so the original may have discarded
# or altered text that is returned unchanged here.
sub delete_list_and_table_crap {
    my $rec       = $_[0];
    my $no_tables = delete_table_crap($rec);
    my $no_lists  = delete_list_crap($no_tables);
    return $no_lists;
}

# Replaces table markup by sentence punctuation: each cell becomes its own
# "sentence".
# NOTE(review): the sub header and the opening-tag names are reconstructed.
sub delete_table_crap {
    my $rec = $_[0];
    $rec =~ s/<TABLE\b[^>]*>//gi;
    $rec =~ s/<\/TABLE>//gi;
    $rec =~ s/<TR\b[^>]*>//gi;
    $rec =~ s/<\/TR>//gi;
    $rec =~ s/\.<\/TD>/\./gi;
    $rec =~ s/<\/TD>/\./gi;
    $rec =~ s/<TD\b[^>]*>/ /gi;
    $rec =~ s/\.<\/TH>/\./gi;
    $rec =~ s/<\/TH>/\./gi;
    $rec =~ s/<TH\b[^>]*>/\[heading\] /gi;
    $rec =~ s/<COLGROUP\b[^>]*>.*?<\/COLGROUP>//gi;
    $rec =~ s/<COL\b[^>]*>.*?<\/COL>//gi;
    $rec =~ s/[ ]+/ /gi;
    return $rec;
}

# Replaces list markup by sentence punctuation: each item becomes its own
# "sentence".
# NOTE(review): opening-tag names are reconstructed.
sub delete_list_crap {
    my $rec = $_[0];
    $rec =~ s/<UL\b[^>]*>//gi;
    $rec =~ s/<\/UL>/\./gi;
    $rec =~ s/<OL\b[^>]*>//gi;
    $rec =~ s/<\/OL>/\./gi;
    $rec =~ s/<DL\b[^>]*>//gi;
    $rec =~ s/<\/DL>/\./gi;
    $rec =~ s/<DT\b[^>]*>/ /gi;
    $rec =~ s/<DD\b[^>]*>/ \- /gi;
    $rec =~ s/<\/DD>//gi;
    $rec =~ s/<DIR\b[^>]*>.*?<\/DIR>//gi;
    $rec =~ s/<MENU\b[^>]*>.*?<\/MENU>//gi;
    $rec =~ s/[ ]+(<LI>)/$1/gi;
    $rec =~ s/[ ]+(<\/LI>)/$1/gi;
    $rec =~ s/\.(<\/LI>)/\./gi;
    $rec =~ s/(<\/LI>)/\./gi;
    $rec =~ s/\.(<LI>)/\./gi;
    $rec =~ s/(<LI>)/\./gi;
    return $rec;
}

# Turns block-level breaks into full stops so that the caller can split the
# text into sentences, and protects full stops that do not end a sentence
# with the placeholders [stop], [quot] and [ellipsis].
# NOTE(review): the HR/BR/P/BLOCKQUOTE/Q/H<n>/CAPTION opening-tag patterns
# are reconstructed.
sub tidy_sentence_breaks {
    my $rec = $_[0];

    # converting horizontal rules, line breaks and paragraph ends to fullstops
    $rec =~ s/<HR>/\. /gi;
    $rec =~ s/<BR>/ | /gi;
    $rec =~ s/<P\b[^>]*>/<P>/gi;
    $rec =~ s/<P><\/P>//gi;
    $rec =~ s/<P>[ ]+<\/P>//gi;
    $rec =~ s/\.<\/P>/\. /gi;
    $rec =~ s/<\/P>/\. /gi;
    $rec =~ s/[ ]+(<P>)/$1/gi;
    $rec =~ s/\.<P>/\. /gi;
    $rec =~ s/<P>/\. /gi;
    $rec =~ s/[ ]+/ /gi;
    $rec =~ s/\. \./\. /gi;
    $rec =~ s/<BLOCKQUOTE\b[^>]*>//gi;
    $rec =~ s/<\/BLOCKQUOTE>/\./gi;
    $rec =~ s/<Q\b[^>]*>//gi;
    $rec =~ s/<\/Q>/\./gi;
    $rec =~ s/<H[0-9]\b[^>]*>/\[heading\]/gi;
    $rec =~ s/<\/H[0-9]>/\./gi;
    $rec =~ s/<CAPTION\b[^>]*>/\[caption\]/gi;
    $rec =~ s/<\/CAPTION>/\./gi;

    # converting fullstops in name abbreviations, URLs, etc.
    $rec =~ s/([A-Z])\.( [A-Z])\.( [A-Z][a-z]+)/$1\[stop\]$2\[stop\]$3/g;
    $rec =~ s/([A-Z])\.([A-Z])\.( [A-Z][a-z]+)/$1\[stop\]$2\[stop\]$3/g;
    $rec =~ s/( [A-Z])\.( [A-Z][a-z]+)/$1\[stop\]$2/g;
    $rec =~ s/([a-zA-Z]+)\.([cC][oO][mM])\.([a-zA-Z]+ )/$1\[stop\]$2\[stop\]$3/g;
    $rec =~ s/([a-zA-Z]+)\.([cC][oO][mM] )/$1\[stop\]$2/g;
    # The quantifier belongs inside the group: "([a-zA-Z])+" kept only the
    # last letter of the quoted word.
    $rec =~ s/ \"([a-zA-Z]+)\" / \[quot\]$1\[quot\] /g;
    $rec =~ s/\.[\.]+/\[ellipsis\]/g;
    $rec =~ s/([0-9])\.([0-9])/$1\[stop\]$2/g;
    return $rec;
}

# Removes the remaining markup that carries no sentence structure:
# comments, link wrappers, form and object elements together with their
# content, and inline formatting tags.
#
# NOTE(review): opening-tag names are reconstructed from the surviving
# closing-tag patterns.  Nine patterns that replaced a stand-alone tag by a
# space were lost entirely; the names in @standalone_tags other than
# OPTGROUP are guesses.
sub delete_all_other_tags {
    my $rec = $_[0];

    $rec =~ s/<\!\-\-.*?\-\->/ /g;
    $rec =~ s/<A\b[^>]*>(.*?)<\/A>/$1/gi;

    # Elements dropped together with their content.  MAP and OBJECT used a
    # greedy ".*" in the reviewed copy, which would delete everything up to
    # the LAST closing tag on the page.
    foreach my $container (qw(SERVER MAP OBJECT FORM BUTTON LABEL SELECT
                              TEXTAREA FIELDSET LEGEND)) {
        $rec =~ s/<$container\b[^>]*>.*?<\/$container>/ /gi;
    }

    my @standalone_tags = qw(HTML HEAD BODY AREA PARAM EMBED ISINDEX INPUT
                             OPTGROUP);
    my $standalone = join '|', @standalone_tags;
    $rec =~ s/<(?:$standalone)\b[^>]*>/ /gi;
    $rec =~ s/<\/OPTGROUP>/ /gi;

    $rec = _strip_tags($rec, '', qw(
        FRAMESET IFRAME EM BLINK SMALL BIG STRONG STRIKE S TT PRE CENTER
        MARQUEE CITE CODE SAMP KBD VAR DFN DIV SPAN INS DEL ACRONYM ABBR
        ADDRESS MULTICOL SPACER LAYER ILAYER NOLAYER
    ));
    $rec =~ s/<\/(?:BODY|HEAD|HTML)>//gi;

    $rec =~ s/\&quot\;/\`/gi;
    return $rec;
}

# This program searches the first page of hits of an AltaVista search
#
# Written by
# James Lambert - 15/9/99
#
# additions by Adam Kilgarriff 26/10/99
# main changes: outputs an html page, tries to be clever about the object
#               that is "a linguistic context"
# do we want ALL instances from a doc, or one per doc?
# do we want only instances with decent linguistic context, or all?
# what counts to keep -- maybe all of the following:
#     docs containing at least one good hit
#     docs with at least one hit (with or without decent context)
#     timed-out items
#     no-such-page items
# DUPLICATES --
#     what do we check for duplicates -- probably, just the context
#         we shall be showing the lexicographer
#     how do we count duplicates -- eg if 2 docs are duplicates of
#         each other but each have 2 good instances... do we show
#         one of the good instances from each... how many duplicates
#         do we then record?
# "documents found" and "word count" from altavista response
# we should also log "time taken" and put it in the output page
# do we carry on paging through, looking at *all* hits, or do we set
#     some limit, eg first 200?  (NEXT JOB is to get this to page through
#     the "next" web pages - I've started but it's not working yet)
# output for multiple search terms needs multiple html files - haven't done that
# good context is the number of words of context required to make a hit
#     lexicographically useful - should probably be 3 or more
# contextsize - eg how many characters fit in half (or less) of
#     a netscape screen

# Unfinished paging code, kept for reference (parts of its patterns were
# lost in the reviewed copy):
#
#sub find_next_lists
#{
#    foreach $line (@_)
#    {
#        if (/^\ \;([0-9]+)<\/a> //;
#            $nextref = "http://altavista.yellowpages.com.au/" . $nextref . "\"";
#            if ($1 > 1)
#            {
#                print "\n\n\n\n";
#                push @next_list, $nextref;
#            }
#        }
#        return @next_list;
#    }
#}

#close(TESTFILE);

# Run only when executed as a script, so the subs can be loaded on their own.
main(@ARGV) unless caller;