home *** CD-ROM | disk | FTP | other *** search
- #!perl
- eval 'exec perl -S $0 "$@"'
- if 0;
- # -*- Mode: Perl -*-
- # $Basename: perlindex.PL $
- # $Revision: 1.2 $
- # Author : Ulrich Pfeifer
- # Created On : Mon Jan 22 13:00:41 1996
- # Last Modified By: Ulrich Pfeifer
- # Last Modified On: Wed Jun 18 19:43:36 2003
- # Language : Perl
- # Update Count : 302
- # Status : Unknown, Use with caution!
- #
- # (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
- #
- #
- # %SEEN is used to store the absolute pathes to files which have been
- # indexed. Probably this could be replaced by %FN.
- #
- # %FN $FN{'last'} greatest documentid
- # $FN{$did} a pair of $mtf and $filename where $mtf is the
- # number of occurances of the most frequent word in
- # the document with number $did.
- #
- # %IDF $IDF{'*all*'} number of documents (essentially the same as
- # $FN{'last'})
- # $IDF{$word} number of documents containing $word
- #
- # %IF $IF{$word} list of pairs ($docid,$tf) where $docid is
- # the number of a document containing $word $tf
- #
- use Fcntl;
- use less 'time';
- use Getopt::Long;
- use File::Basename;
- use Text::English;
-
- # NDBM_File as LAST resort
- package AnyDBM_File;
-
- @ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File) unless @ISA;
-
- for $mod (@ISA) {
- last if eval "require $mod"
- }
-
- package main;
-
- $p = 'w'; # compressed int patch available
- $nroff = '';
- $man1direxp = 'D:\Perl\man\man1';
- $man3direxp = 'D:\Perl\man\man3';
- @perllib = 'D:\Perl\lib';
- push @perllib,'D:\Perl\site\lib';
- $prefix = 'D:\Perl';
- $pager = 'more /e';
- use Term::ReadKey;
-
- $stemmer = \&Text::English::stem;
- # directory for the index
- $IDIR = $man1direxp;
- $IDIR =~ s:/[^/]*$::;
-
- $opt_index = ''; # make perl -w happy
- $opt_menu = 1;
- $opt_maxhits = 15;
- $opt_cbreak = 1;
- &GetOptions(
- 'index',
- 'cbreak!',
- 'maxhits=i',
- 'menu!',
- 'verbose',
- 'dict:i',
- 'idir=s',
- ) || die "Usage: $0 [-index] [words ...]\n";
-
- if (defined $opt_idir) {
- $IDIR = $opt_idir; # avoid to many changes below.
- }
-
- if (defined $opt_dict) {
- $opt_dict ||= 100;
- }
-
- if ($opt_index) {
- &initstop;
-
- tie (%IF, AnyDBM_File, "$IDIR/index_if", O_CREAT|O_RDWR, 0644)
- or die "Could not tie $IDIR/index_if: $!\n";
- tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_CREAT|O_RDWR, 0644)
- or die "Could not tie $IDIR/index_idf: $!\n";
- tie (%SEEN, AnyDBM_File, "$IDIR/index_seen", O_CREAT|O_RDWR, 0644)
- or die "Could not tie $IDIR/index_seen: $!\n";
- tie (%FN, AnyDBM_File, "$IDIR/index_fn", O_CREAT|O_RDWR, 0644)
- or die "Could not tie $IDIR/index_fn: $!\n";
-
- require "find.pl";
-
- unless (@ARGV) { # breaks compatibility :-(
- for $dir (@perllib) {
- print "Scanning $dir ... \n";
- &find($dir);
- }
- }
- for $name (@ARGV) {
- my $fns = $name;
- $fns =~ s:$prefix/::;
- next if $SEEN{$fns};
- next unless -f $name;
- if ($name !~ /(~|,v)$/) {
- $did = $FN{'last'}++;
- $SEEN{$fns} = &index($name, $fns, $did);
- }
- }
- untie %IF;
- untie %IDF;
- untie %FN;
- untie %SEEN;
- } elsif ($opt_dict) {
- tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_RDONLY, 0644)
- or die "Could not tie $IDIR/index_idf: $!\n".
- "Did you run '$0 -index'?\n";
- while (($key,$val) = each %IDF) {
- printf "%-20s %d\n", $key, $val if $val >= $opt_dict;
- }
- untie %IDF;
- } else {
- tie (%IF, AnyDBM_File, "$IDIR/index_if", O_RDONLY, 0644)
- or die "Could not tie $IDIR/index_if: $!\n".
- "Did you run '$0 -index'?\n";
- tie (%IDF, AnyDBM_File, "$IDIR/index_idf", O_RDONLY, 0644)
- or die "Could not tie $IDIR/index_idf: $!\n";
- tie (%FN, AnyDBM_File, "$IDIR/index_fn", O_RDONLY, 0644)
- or die "Could not tie $IDIR/index_fn: $!\n";
- &search(@ARGV);
- untie %IF;
- untie %IDF;
- untie %FN;
- untie %SEEN;
- }
-
- sub wanted {
- my $fns = $name;
-
- if ($name eq $man3direxp) {
- $prune = 1;
- }
- $fns =~ s:$prefix/::;
- return if $SEEN{$fns};
- return unless -f $_;
- if ($name =~ /man|bin|\.(pod|pm)$/) {
- if (!/(~|,v)$/) {
- $did = $FN{'last'}++;
- $SEEN{$fns} = &index($name, $fns, $did);
- }
- }
- }
-
- sub index {
- my $fn = shift;
- my $fns = shift;
- my $did = shift;
- my %tf;
- my $maxtf = 0;
- my $pod = ($fns =~ /\.pod|man/);
-
- open(IN, "<$fn") || warn "Could not open $fn: $!\n", return (0);
- while ($line = <IN>) {
- if ($line =~ /^=head/) {
- $pod = 1;
- } elsif ($line =~ /^=cut/){
- $pod = 0;
- } else {
- next unless $pod;
- }
- for $word (&normalize($line)) {
- next if $stop{$word};
- $tf{$word}++;
- }
- }
- for $tf (values %tf) {
- $maxtf = $tf if $tf > $maxtf;
- }
- for $word (keys %tf) {
- $IDF{$word}++;
- $IF{$word} = '' unless defined $IF{$word}; # perl -w
- $IF{$word} .= pack($p.$p, $did, $tf{$word});
- }
- close(IN);
- $FN{$did} = pack($p, $maxtf).$fns;
- print STDERR "$fns\n";
- 1;
- }
-
- sub normalize {
- my $line = join ' ', @_;
- my @result;
-
- $line =~ tr/A-Z/a-z/;
- $line =~ tr/a-z0-9/ /cs;
- for $word (split ' ', $line ) {
- $word =~ s/^\d+//;
- next unless length($word) > 2;
- if ($stemmer) {
- push @result, &$stemmer($word);
- } else {
- push @result, $word;
- }
- }
- @result;
- }
-
- sub search {
- my %score;
- my $maxhits = $opt_maxhits;
- my (@unknown, @stop);
-
- &initstop if $opt_verbose;
- for $word (normalize(@_)) {
- unless ($IF{$word}) {
- if ($stop{$word}) {
- push @stop, $word;
- } else {
- push @unknown, $word;
- }
- next;
- }
- my %post = unpack($p.'*',$IF{$word});
- my $idf = log($FN{'last'}/$IDF{$word});
- for $did (keys %post) {
- my ($maxtf) = unpack($p, $FN{$did});
- $score{$did} = 0 unless defined $score{$did}; # perl -w
- $score{$did} += $post{$did} / $maxtf * $idf;
- }
- }
- if ($opt_verbose) {
- print "Unkown: @unknown\n" if @unknown;
- print "Ingnore: @stop\n" if @stop;
- }
- if ($opt_menu) {
- my @menu;
- my $answer = '';
- my $no = 0;
- my @s = ('1' .. '9', 'a' .. 'z');
- my %s;
-
- for $did (sort {$score{$b} <=> $score{$a}} keys %score) {
- my ($mtf, $path) = unpack($p.'a*', $FN{$did});
- my $s = $s[$no];
- push @menu, sprintf "%s %6.3f %s\n", $s, $score{$did}, $path;
- $s{$s} = ++$no;
- last unless --$maxhits;
- }
- &cbreak('on') if $opt_cbreak;
- while (1) {
- print @menu;
- print "\nEnter Number or 'q'> ";
- if ($opt_cbreak) {
- read(TTYIN,$answer,1);
- print "\n";
- } else {
- $answer = <STDIN>;
- }
- last if $answer =~ /^q/i;
- $answer = ($s{substr($answer,0,1)})-1;
- if ($answer >= 0 and $answer <= $#menu) {
- my $selection = $menu[$answer];
- if ($selection =~ m:/man:) {
- my ($page, $sect) =
- ($selection =~ m:([^/]*)\.(.{1,3})$:);
- print STDERR "Running man $sect $page\n";
- system 'man', $sect, $page;
- } else {
- my ($path) = ($selection =~ m:(\S+)$:);
- $path = $prefix.'/'.$path;
- print STDERR "Running pod2man $path\n";
- system "pod2man --official $path | $nroff -man | $pager";
- }
- } else {
- my $path = $prefix."/bin/perlindex";
- system "pod2man --official $path | $nroff -man | $pager";
- }
- }
- &cbreak('off') if $opt_cbreak;
- } else {
- for $did (sort {$score{$b} <=> $score{$a}} keys %score) {
- printf("%6.3f %s\n", $score{$did},
- (unpack($p.'a*', $FN{$did}))[1]);
- last unless --$maxhits;
- }
- }
- }
-
- sub cbreak {
- my $mode = shift;
- if ($mode eq 'on') {
- open(TTYIN, "</dev/tty") || die "can't read /dev/tty: $!";
- open(TTYOUT, ">/dev/tty") || die "can't write /dev/tty: $!";
- select(TTYOUT);
- $| = 1;
- select(STDOUT);
- $SIG{'QUIT'} = $SIG{'INT'} = 'cbreak';
- ReadMode 3; # Set cbreak mode
- } else {
- ReadMode 0; # Restore non-cbreak mode
- }
- }
-
-
- $stopinited = 0; # perl -w
- sub initstop {
- return if $stopinited++;
- while (<DATA>) {
- next if /^\#/;
- for (normalize($_)) {
- $stop{$_}++;
- }
- }
- }
-
- =head1 NAME
-
- perlindex - index and query perl manual pages
-
- =head1 SYNOPSIS
-
- perlindex -index
-
- perlindex tell me where the flowers are
-
- =head1 DESCRIPTION
-
- "C<perlindex -index>" generates an AnyDBM_File index which can be
- searched with free text queries "C<perlindex> I<a verbose query>".
-
- Each word of the query is searched in the index and a score is
- generated for each document containing it. Scores for all words are
- added and the documents with the highest score are printed. All words
- are stemed with Porters algorithm (see L<Text::English>) before
- indexing and searching happens.
-
- The score is computed as:
-
- $score{$document} += $tf{$word,$document}/$maxtf{$document}
- * log ($N/$n{$word});
-
- where
-
- =over 10
-
- =item C<$N>
-
- is the number of documents in the index,
-
- =item C<$n{$word}>
-
- is the number of documents containing the I<word>,
-
- =item C<$tf{$word,$document}>
-
- is the number of occurances of I<word> in the I<document>, and
-
- =item C<$maxtf{$document}>
-
- is the maximum freqency of any word in I<document>.
-
- =back
-
- =head1 OPTIONS
-
- All options may be abreviated.
-
- =over 10
-
- =item B<-maxhits> maxhits
-
- Maximum numer of hits to display. Default is 15.
-
- =item B<-menu>
-
- =item B<-nomenu>
-
- Use the matches as menu for calling C<man>. Default is B<-menu>.q
-
- =item B<-cbreak>
-
- =item B<-nocbreak>
-
- Switch to cbreak in menu mode or dont. B<-cbreak> is the default.
-
- =item B<-verbose>
-
- Generates additional information which query words have been not found
- in the database and which words of the query are stopwords.
-
- =back
-
- =head1 EXAMPLE
-
- perlindex foo bar
-
- 1 3.735 lib/pod/perlbot.pod
- 2 2.640 lib/pod/perlsec.pod
- 3 2.153 lib/pod/perldata.pod
- 4 1.920 lib/Symbol.pm
- 5 1.802 lib/pod/perlsub.pod
- 6 1.586 lib/Getopt/Long.pm
- 7 1.190 lib/File/Path.pm
- 8 1.042 lib/pod/perlop.pod
- 9 0.857 lib/pod/perlre.pod
- a 0.830 lib/Shell.pm
- b 0.691 lib/strict.pm
- c 0.691 lib/Carp.pm
- d 0.680 lib/pod/perlpod.pod
- e 0.680 lib/File/Find.pm
- f 0.626 lib/pod/perlsyn.pod
- Enter Number or 'q'>
-
- Hitting the keys C<1> to C<f> will display the corresponding manual
- page. Hitting C<q> quits. All other keys display this manual page.
-
- =head1 FILES
-
- The index will be generated in your man directory. Strictly speaking in
- C<$Config{man1direxp}/..>
-
- The following files will be generated:
-
- index_fn # docid -> (max frequency, filename)
- index_idf # term -> number of documents containing term
- index_if # term -> (docid, frequency)*
- index_seen # fn -> indexed?
-
-
- =head1 AUTHOR
-
- Ulrich Pfeifer E<lt>F<pfeifer@ls6.informatik.uni-dortmund.de>E<gt>
-
- =cut
-
- __END__
- # freeWAIS-sf stopwords
- a
- about
- above
- according
- across
- actually
- adj
- after
- afterwards
- again
- against
- all
- almost
- alone
- along
- already
- also
- although
- always
- among
- amongst
- an
- and
- another
- any
- anyhow
- anyone
- anything
- anywhere
- are
- aren't
- around
- as
- at
- b
- be
- became
- because
- become
- becomes
- becoming
- been
- before
- beforehand
- begin
- beginning
- behind
- being
- below
- beside
- besides
- between
- beyond
- billion
- both
- but
- by
- c
- can
- can't
- cannot
- caption
- co
- co.
- could
- couldn't
- d
- did
- didn't
- do
- does
- doesn't
- don't
- down
- during
- e
- each
- eg
- eight
- eighty
- either
- else
- elsewhere
- end
- ending
- enough
- etc
- even
- ever
- every
- everyone
- everything
- everywhere
- except
- f
- few
- fifty
- first
- five
- vfor
- former
- formerly
- forty
- found "
- four
- from
- further
- g
- h
- had
- has
- hasn't
- have
- haven't
- he
- he'd
- he'll
- he's
- hence
- her
- here
- here's
- hereafter
- hereby
- herein
- hereupon
- hers
- herself
- him
- himself
- his
- how
- however
- hundred
- i
- i'd
- i'll
- i'm
- i've
- ie
- if
- in
- inc.
- indeed
- instead
- into
- is
- isn't
- it
- it's
- its
- itself
- j
- k
- l
- last
- later
- latter
- latterly
- least
- less
- let
- let's
- like
- likely
- ltd
- m
- made
- make
- makes
- many
- maybe
- me
- meantime
- meanwhile
- might
- million
- miss
- more
- moreover
- most
- mostly
- mr
- mrs
- much
- must
- my
- myself
- n
- namely
- neither
- never
- nevertheless
- next
- nine
- ninety
- no
- nobody
- none
- nonetheless
- noone
- nor
- not
- nothing
- now
- nowhere
- o
- of
- off
- often
- on
- once
- one
- one's
- only
- onto
- or
- other
- others
- otherwise
- our
- ours
- ourselves
- out
- over
- overall
- own
- p
- per
- perhaps
- q
- r
- rather
- recent
- recently
- s
- same
- seem
- seemed
- seeming
- seems
- seven
- seventy
- several
- she
- she'd
- she'll
- she's
- should
- shouldn't
- since
- six
- sixty
- so
- some
- somehow
- someone
- something
- sometime
- sometimes
- somewhere
- still
- stop
- such
- t
- taking
- ten
- than
- that
- that'll
- that's
- that've
- the
- their
- them
- themselves
- then
- thence
- there
- there'd
- there'll
- there're
- there's
- there've
- thereafter
- thereby
- therefore
- therein
- thereupon
- these
- they
- they'd
- they'll
- they're
- they've
- thirty
- this
- those
- though
- thousand
- three
- through
- throughout
- thru
- thus
- to
- together
- too
- toward
- towards
- trillion
- twenty
- two
- u
- under
- unless
- unlike
- unlikely
- until
- up
- upon
- us
- used
- using
- v
- very
- via
- w
- was
- wasn't
- we
- we'd
- we'll
- we're
- we've
- well
- were
- weren't
- what
- what'll
- what's
- what've
- whatever
- when
- whence
- whenever
- where
- where's
- whereafter
- whereas
- whereby
- wherein
- whereupon
- wherever
- whether
- which
- while
- whither
- who
- who'd
- who'll
- who's
- whoever
- whole
- whom
- whomever
- whose
- why
- will
- with
- within
- without
- won't
- would
- wouldn't
- x
- y
- yes
- yet
- you
- you'd
- you'll
- you're
- you've
- your
- yours
- yourself
- yourselves
- z
- # occuring in more than 100 files
- acc
- accent
- accents
- and
- are
- bell
- can
- character
- corrections
- crt
- daisy
- dash
- date
- defined
- definitions
- description
- devices
- diablo
- dummy
- factors
- following
- font
- for
- from
- fudge
- give
- have
- header
- holds
- log
- logo
- low
- lpr
- mark
- name
- nroff
- out
- output
- perl
- pitch
- put
- rcsfile
- reference
- resolution
- revision
- see
- set
- simple
- smi
- some
- string
- synopsis
- system
- that
- the
- this
- translation
- troff
- typewriter
- ucb
- unbreakable
- use
- used
- user
- vroff
- wheel
- will
- with
- you
-
-