#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Find;
use LWP::Simple;
use HTML::LinkExtor;
use URI::URL;

# File-scoped lexical; not referenced by any code in this file at present.
my (%options,
	);
# Package globals shared between the main program and the subs below.
use vars qw($directory $extension $frequency $depth
            $ignore $keywords $logfile $output
            $verbose $help $recurse @words $words
	    $URL %count $parser %Docs
	    $p $host
		);

my ($word);

Getopt::Long::Configure("bundling");
# GetOptions returns false when it meets an unknown or malformed option.
# Show the usage text in that case rather than running with a command
# line that was only partly understood.
GetOptions(
 "h+"  => \$help,
 "v+"  => \$verbose,
 "r+"  => \$recurse, # For file searches
 "f+"  => \$frequency,
 "H+"  => \$host,
 "d:s" => \$directory,
 "e:s" => \$extension,
 "l:s" => \$logfile,
 "o:s" => \$output,
 "D:i" => \$depth, # For web searches
) || Usage();

$help && Usage();

# -D is optional.  Give $depth a numeric default so the depth comparisons
# made during a web search never see an undefined value.
$depth = 0 unless defined $depth;

# Whatever is left on the command line is the keyword list.
@words = @ARGV;
chomp @words;
Usage() unless @words;
$words = join '|', @words; # One alternation pattern matching any keyword
if ($logfile) {
	# Three-argument open: the mode is fixed here, so characters in the
	# user-supplied file name cannot change how the file is opened.
	open (LOG, '>', $logfile) || 
                     die("Could not open $logfile for writing: $!\n");
}# End if

if ($directory) {
	# Search local file system
	# Drop a trailing slash, but leave a bare "/" alone so that searching
	# the root directory does not turn into searching the empty string.
	$directory =~ s!/$!! unless $directory eq '/';
	$verbose && print "Searching $directory\n";
	$logfile && print LOG "Searching $directory\n";

	# Compile the -e suffix once.  \Q...\E makes every character of the
	# suffix literal, so ".txt" matches only a real dot.
	my $suffix = $extension ? qr/\Q$extension\E$/ : undef;

	if ($recurse) {
		# no_chdir keeps the working directory fixed while File::Find
		# walks the tree.  search() opens $File::Find::name, which is
		# relative to the starting directory, so letting File::Find
		# chdir would make every relative path unopenable.
		find({
			no_chdir => 1,
			wanted   => sub {
				# Honour -e in recursive mode as well.
				return if $suffix && $File::Find::name !~ $suffix;
				search();
			},
		}, $directory);
        } else {
		opendir (DIR, $directory) || die 
			"Could not read $directory: $!\n";
		my @files = readdir DIR;
		closedir DIR;
		# Keep only entries that look like text files.
		@files = grep (-T "$directory/$_", @files);
		@files = grep ($_ =~ $suffix, @files) if $suffix;
		$verbose && print "Found files: ", (join ", ", @files), "\n";

		search("$directory/$_") for @files;
	} # End else

} elsif (@words && $words[0] =~ m!^(ht|f)tp://!)	{
	# Search WWW
	# The first argument is the start URL, not a keyword.  Remove it and
	# rebuild the keyword pattern so the URL itself is not searched for.
	$URL = shift @words;
	Usage() unless @words;
	$words = join '|', @words;

	$p = HTML::LinkExtor->new();
	if ($host) {
		# Reduce the start URL to "scheme://host" for the -H restriction.
		($host = $URL) =~ s!((.*?)//(.*?))/.*$!$1!;
		$verbose && print "Restricting queries to pages on $host\n";
	}

	searchpage(0, $URL);
	for (sort keys %Docs) {
		$verbose && print "Looked at $_\n";
	}
} else {
	Usage();
}

# Results 
if ($output)	{
	# Three-argument open: the mode is fixed here, so characters in the
	# user-supplied file name cannot change how the file is opened.
	open (OUT, '>', $output) || 
			die ("Could not open $output for writing: $!\n");
} # End if

# Write one piece of report text.  With -o it goes to the output file and,
# when -v is also given, is echoed to STDOUT; without -o it goes to STDOUT.
my $emit = sub {
	my ($text) = @_;
	if ($output) {
		print OUT $text;
		$verbose && print $text;
	} else {
		print $text;
	}
};

for my $file (sort keys %count)	{ # The keys of %count are file names or URLs
	my @found = sort keys %{$count{$file}};
	if ($frequency)	{ # With -f: one "name|keyword|count" line per keyword
		$emit->("$file|$_|$count{$file}{$_}\n") for @found;
	} else { # Without -f: one "name|keyword,keyword,..." line per document
		$emit->("$file|" . join(',', @found) . "\n");
	}
} # End for files

close LOG if $logfile;
if ($output) {
	# Buffered write errors (for example a full disk) only show up when
	# the handle is closed, so the result of close has to be checked.
	close (OUT) || die ("Could not close $output: $!\n");
}

sub search {
	# Search one text file for the keywords.
	#
	# The file name comes from $File::Find::name when that is set (the sub
	# is then running as a File::Find callback), otherwise from the first
	# argument.
	#
	# For every keyword in @words, $count{$file}{$keyword} is incremented
	# once per line of the file that matches the keyword.  Files that -B
	# classifies as binary are skipped.  A file that cannot be opened is
	# reported on STDERR, and in the log when -l is active, then skipped.
	my $file = $File::Find::name || shift;
	return unless defined $file;
	-B $file && return;

	$verbose && print "Searching $file\n";

	my $fh;
	unless (open ($fh, '<', $file)) {
		my $problem = "Could not open $file for reading: $!\n";
		warn $problem;
		$logfile && print LOG $problem;
		return;
	}

	while (defined(my $line = <$fh>))	{
		# $words is the alternation of all keywords: a cheap test that
		# skips lines containing none of them.
		next unless $line =~ /$words/;
		$verbose && print " $.:\t$line";
		for my $keyword (@words) {
			$count{$file}{$keyword}++ if $line =~ /$keyword/;
		}
	}
	close $fh;
	return;
} # End sub search

sub searchpage	{
	# Fetch one web page, count the keywords in it and follow its links.
	#
	# Arguments:
	#   $cur_depth - link distance of this page from the start page
	#   $url       - address of the page to fetch
	#
	# Globals read:    $depth, $host, $words, @words, $verbose, $logfile
	# Globals written: %Docs  (every URL this sub was called for)
	#                  %count ($count{$url}{$keyword} = number of matches,
	#                          case-insensitive)
	#
	# Returns 0.  Callers in this file ignore the return value.
	my ($cur_depth, $url) = @_;

	# -D is optional; without it only the start page is searched.
	my $max_depth = defined $depth ? $depth : 0;

	$verbose && print "Looking at $url, at depth $cur_depth\n";
	$Docs{$url} = 1;

	return(0) if ($cur_depth > $max_depth);

	# get() returns undef when the page cannot be retrieved.
	my $content = get($url);
	unless (defined $content) {
		$verbose && print "Could not retrieve $url\n";
		$logfile && print LOG "Could not retrieve $url\n";
		return(0);
	}

	if ($content =~ m/$words/is)	{
		for my $keyword (@words) {
			# Count matches one at a time; a list-context match would
			# over-count keywords that contain capture groups.
			my $hits = 0;
			$hits++ while $content =~ /$keyword/gis;
			$count{$url}{$keyword} = $hits if $hits;
		}
	} # End if

	# Nothing below this page would be visited, so skip link extraction.
	return(0) if ($cur_depth + 1 > $max_depth);

	# Use a fresh parser per page and signal end-of-document, so text
	# still buffered from one page can never leak into the next.
	my $extractor = HTML::LinkExtor->new();
	$extractor->parse($content);
	$extractor->eof;

	for my $link ($extractor->links)  {
		# Each link is [tag, attribute => value, ...].  Only <a href>
		# is followed; anything else (images, frames, ...) is skipped
		# instead of reusing the address of the previous link.
		my ($tag, %attr) = @$link;
		next unless ($tag eq 'a' && defined $attr{href});

		my $abs = url($attr{href}, $url)->abs;
		$abs = "$abs"; # Work on the string form from here on
		$abs =~ s/#.*$//;
		$abs =~ s!/$!!;

		# Skip some URLs
		next unless length $abs;
		next if $abs=~/^mailto/i;
		# The dot is required: "/guitar" is not a tar archive.
		next if $abs=~/\.(gz|zip|exe|tar|Z)$/;
		# With -H, $host holds "scheme://host" of the start URL.  Match
		# it literally and only up to a boundary, so a longer host name
		# that merely starts the same way is not accepted.
		next if (defined $host && length $host
			 && $abs !~ m!^\Q$host\E(?:[/:?]|\z)!);
		next if $abs=~/\?\S+?=\S+/;
		next if $Docs{$abs};

		searchpage($cur_depth+1, $abs);
	}
	return(0);
} # End sub searchpage

sub Usage {
	# Print the command-line synopsis to STDOUT and exit with status 0.
	# Only options that GetOptions actually accepts are listed.  The
	# here-document is single-quoted so nothing in it is interpolated.
	print <<'EndUsage';

Usage: keywordsearch [-rfv] [-e suffix] [-l log] [-o outfile]
                     -d dir keywords
       keywordsearch [-fvH] [-D depth] [-l log] [-o outfile] 
                     URL keywords
Options:
    -d          Directory to search
    -e          File extension to search (e.g. .txt .html)
                Default behavior is to search only text-type files
    -f          Output the frequency of the words
    -D          Depth of search, or how many levels down to go
                (Web search) (Default is 0)
    -r          Recurse subdirectories (File search)
    -l          Log file for problems found
    -o          Output file (Default is STDOUT)
    -v          Verbose
    -H          Restrict to the initial host (Web search)
    -h          Show this help text

EndUsage
	exit(0);
}

=head1 NAME

keywordsearch - Searches a web site, or a local directory, for keyword(s)

=head1 DESCRIPTION

Simple command line tool for searching either a web site, or a local directory,
for documents that contain a particular word.

=head1 PREREQUISITES

Uses C<Getopt::Long>, C<File::Find>, C<LWP::Simple>, C<HTML::LinkExtor> and C<URI::URL>

=head1 COREQUISITES

None

=head1 README

Simple command-line tool for searching either a web site, or a local 
directory, for documents containing particular keyword(s).

=pod OSNAMES

Any

=pod SCRIPT CATEGORIES

Search

=head1 Author

Written by Rich Bowen <rbowen@rcbowen.com> for The Creative Group 
(<http://www.cre8tivegroup.com>)

=cut