home *** CD-ROM | disk | FTP | other *** search
- Newsgroups: alt.sources
- From: goer@ellis.uchicago.edu (Richard L. Goerwitz)
- Subject: kjv browser, part 4 of 11
- Message-ID: <1991Jul3.065038.28067@midway.uchicago.edu>
- Date: Wed, 3 Jul 1991 06:50:38 GMT
-
- ---- Cut Here and feed the following to sh ----
- #!/bin/sh
- # this is bibleref.04 (part 4 of a multipart archive)
- # do not concatenate these parts, unpack them in order with /bin/sh
- # file binsrch.icn continued
- #
- # Refuse to run unless the sequence file left by an earlier part is
- # readable; its absence is reported as part 1 not having been unpacked.
- if test ! -r _shar_seq_.tmp; then
- echo 'Please unpack part 1 first!'
- exit 1
- fi
- # The first line of _shar_seq_.tmp names the part expected next; this
- # part requires 4.  The comparison runs in a subshell reading from the
- # file, and the subshell's exit status decides (via || exit 1) whether
- # the enclosing script stops.
- (read Scheck
- if test "$Scheck" != 4; then
- echo Please unpack part "$Scheck" next!
- exit 1
- else
- exit 0
- fi
- ) < _shar_seq_.tmp || exit 1
- if test ! -f _shar_wnt_.tmp; then
- echo 'x - still skipping binsrch.icn'
- else
- echo 'x - continuing file binsrch.icn'
- sed 's/^X//' << 'SHAR_EOF' >> 'binsrch.icn' &&
- X#
- X# This file contains a single procedure, binary_index_search(str,
- X# filename), which goes through a file called filename looking for a
- X# line beginning with str. Note well that binary_index_search()
- X# assumes lines in filename will contain more than str. Str must
- X# occupy the first part of the line, separated from the remainder by
- X# a tab.
- X#
- X############################################################################
- X#
- X# Links: none
- X#
- X# See also: retrieve.icn, makeind.icn
- X#
- X############################################################################
- X
- X
- Xprocedure binary_index_search(entry, index_filename)
- X
- X    # Binary-search index_filename for a line of the form
- X    # entry\toffset.  On a hit, returns the text following the tab
- X    # (the offset, as a string); fails if no line matches.  Aborts
- X    # with exit code 18 if the index cannot be opened.  The index
- X    # file is closed again on every return/fail path.
- X    #
- X    # The search relies on the index lines being in the order that
- X    # the lexical comparison << imposes.
- X    #
- X    # NOTE(review): every probe skips forward to the next newline
- X    # before reading a line, so the very first line of the file is
- X    # never examined - presumably the index is built with that in
- X    # mind; confirm against makeind.icn.
- X
- X    local in_index, bottom, top, loc, incr, firstpart, offset, line
- X
- X    in_index := open(index_filename) |
- X        abort("binary_index_search","can't open "||index_filename,18)
- X
- X    # Position 0 is end of file, so where() yields the upper bound.
- X    bottom := 1
- X    seek(in_index, 0)
- X    top := where(in_index)
- X
- X    # If bottom gets bigger than top, there's no such entry.
- X    until bottom > top do {
- X
- X        loc := (top+bottom) / 2
- X        seek(in_index, loc)
- X
- X        # Move past next newline, counting the characters consumed.
- X        # The loop also ends when reads() fails at end of file, so a
- X        # probe into an unterminated last line cannot spin forever.
- X        incr := 1
- X        while reads(in_index) ~== "\n" do
- X            incr +:= 1
- X        # NOTE(review): with bottom <= loc and incr >= 1 this test
- X        # looks unreachable; kept exactly as in the original.
- X        if loc+incr = bottom then {
- X            top := loc-1
- X            next
- X        }
- X
- X        # No complete line starts after loc, so any match must lie
- X        # lower down.  Shrinking top here guarantees termination.
- X        if not (line := read(in_index)) then {
- X            top := loc-1
- X            next
- X        }
- X
- X        # Check to see if the current line starts with entry (arg 1).
- X        line ? {
- X
- X            # .IND file line format is entry\tbitmap-file-offset
- X            if entry == (firstpart := tab(find("\t"))) then {
- X                # Hit: pick up the offset, release the file, return.
- X                offset := (move(1), tab(0))
- X                close(in_index)
- X                return offset
- X            }
- X            # Ah, this is what all binary searches do.
- X            else {
- X                if entry << firstpart
- X                then top := loc-1
- X                else bottom := loc + incr + *&subject
- X            }
- X        }
- X    }
- X
- X    # Search space exhausted without a match.
- X    close(in_index)
- X    fail
- X
- Xend
- SHAR_EOF
- echo 'File binsrch.icn is complete' &&
- true || echo 'restore of binsrch.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= bmp2text.icn ==============
- if test -f 'bmp2text.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping bmp2text.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting bmp2text.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'bmp2text.icn' &&
- X############################################################################
- X#
- X# Name: bmp2text.icn
- X#
- X# Title: convert a bitmap to a text-chunk
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.12
- X#
- X############################################################################
- X#
- X# This file contains bitmap_2_text(bitmap, filename). Recall that
- X# bitmaps are just a series of fixed-length bitfields used to mark
- X# divisions within a text. The procedure retrieve() finds words in
- X# an index file, and returns a list of these bitmaps, which point to
- X# divisions within the original text file - divisions within which a
- X# given indexed word found by retrieve() occurs. The procedure
- X# bitmap_2_text() simply takes a given bitmap and finds the text
- X# with which it is associated in the full text file.
- X#
- X# Note that bitmap_2_text() does not seek directly to the correct
- X# location within "filename" (arg 2). It first breaks down the
- X# bitmap into a less precise form, looks up the location of that
- X# form, seeks up to its location, and then bumbles along until it
- X# reaches the chunk of text corresponding to the full "bitmap" (arg
- X# 1). The reason bitmap_2_text() does this is that makeind (the
- X# indexing routine which creates data files for retrieve() and
- X# bitmap_2_text()) does not store the offset within filename for
- X# every bitmap. It just saves the locations of major blocks. This
- X# is basically just a space-saving device. It would eat up too much
- X# memory (both disk and core) to keep a list of every offset for
- X# every chunk of text marked out by a bitmap in filename.
- X#
- X# Note also that, although retrieve() returns a list of bitmaps, bit-
- X# map_2_text(bitmap, filename) expects a single bitmap as its first
- X# argument. It is better that text be retrieved as needed, one chunk
- X# at a time, and not stuffed en masse into core memory as soon as it
- X# is retrieve()'d.
- X#
- X############################################################################
- X#
- X# Links: ./indexutl.icn, ./initfile.icn
- X#
- X# See also: retrieve.icn, makeind.icn
- X#
- X############################################################################
- X
- X# Declared in indexutl.icn.
- X# record is(FS, s_len, len, no, is_case_sensitive)
- X# global IS
- X
- X# Declared in initfile.icn.
- X# global filestats
- X# record Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
- X
- Xprocedure bitmap_2_text(bitmap, filename)
- X
- X    # Return the chunk of text in filename (arg 2) keyed by bitmap
- X    # (arg 1), with trailing newlines trimmed.  Fails if the cut-down
- X    # bitmap has no entry in the offset table, or if the scan leaves
- X    # the block (or reaches EOF) without meeting the key.  Aborts
- X    # (exit codes 29, 26, 27) on a missing filename, an unopenable
- X    # text file, or a failed seek.  The text file is closed on every
- X    # return/fail path.  Side effect: resets the global IS to the
- X    # stats stored for filename.
- X
- X    local intext, cut_down_bitmap, upto_field, offset, line, value,
- X        base_value_mask, base_value, location
- X    # global filestats, IS
- X
- X    # Check for sloppy programming.
- X    /filename & abort("bitmap_2_text","you called me without a filename",29)
- X
- X    # If necessary, initialize stats for the current file.
- X    #
- X    if /filestats | /filestats[filename]
- X    then initfile(filename)               # see initfile.icn
- X    # Reset IS to current file.
- X    IS := filestats[filename].IS
- X
- X    # open full text file for reading
- X    intext := open(filename) |
- X        abort("bitmap_2_text", "can't open "||filename, 26)
- X
- X    # Determine offset to seek to by using the bitmap->offset table
- X    # for the current file (arg 2).  The name of the bitmap_offset
- X    # table is stored in filestats[filename].ofs_table.
- X    #
- X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
- X    cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
- X    # No table entry means no such block; release the file before failing.
- X    offset := \filestats[filename].ofs_table[cut_down_bitmap] |
- X        { close(intext); fail }
- X
- X    # Seek to offset, and begin looking for the string equiv. of
- X    # bitmap (arg 1).
- X    #
- X    seek(intext, offset) |
- X        abort("bitmap_2_text","can't seek to offset "||offset, 27)
- X
- X    #
- X    # This works a lot like the routine in gettext.icn (another related
- X    # retrieval package).  Note that bitmaps in "filename" (arg 2) are on
- X    # their own lines, preceded by a double colon.
- X    #
- X    # First figure out how to tell if we've gone too far.  Basically,
- X    # mask out the lower bits, and record the value of the upper bits.
- X    # Some fooling around is necessary because bitmaps may use large
- X    # ints, making it impossible to use icom() in a naive manner.
- X    # If the upper bits of the bitmaps being read change, then we've
- X    # gone too far.
- X    #
- X    base_value_mask := icom(2^((IS.no - upto_field) * IS.len)- 1)
- X    base_value := iand(bitmap, base_value_mask)
- X
- X    while line := read(intext) do {
- X        line ? {
- X            if ="::" then {
- X                location := digits_2_bitmap(tab(0)) # in indexutl.icn
- X                if bitmap = location
- X                then {
- X                    # Collect all text upto the next colon+colon-initial
- X                    # line (::) or EOF.
- X                    value := ""
- X                    while line := read(intext) do {
- X                        match("::",line) & break
- X                        value ||:= line || "\n"
- X                    }
- X                    # Note that a key with an empty value returns an
- X                    # empty string.
- X                    close(intext)
- X                    return trim(value, '\n')
- X                }
- X                else {
- X                    # Upper bits changed: we have run past the block.
- X                    # Close the file before giving up.
- X                    if base_value ~= iand(location, base_value_mask)
- X                    then {
- X                        close(intext)
- X                        fail
- X                    }
- X                }
- X            }
- X        }
- X    }
- X
- X    # we should have returned by now
- X    close(intext)
- X    fail
- X
- Xend
- X
- SHAR_EOF
- true || echo 'restore of bmp2text.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= initfile.icn ==============
- if test -f 'initfile.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping initfile.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting initfile.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'initfile.icn' &&
- X############################################################################
- X#
- X# Name: initfile.icn
- X#
- X# Title: initialize entry for file in filestats table
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.9
- X#
- X############################################################################
- X#
- X# This file contains initfile(filename), which creates a set of stats
- X# for the indexed database contained in filename. Uses several global
- X# structures, primarily for speed. Beware.
- X#
- X############################################################################
- X#
- X# See also: retrieve.icn, bmp2text.icn, retrops.icn
- X#
- X############################################################################
- X
- X# Used to store stats for each filename.
- Xrecord Fs(ind_filename, bmp_filename, lim_filename, IS, ofs_table)
- X
- X# IS is declared in indexutl.icn.
- X# global IS
- X
- Xglobal filestats
- X
- Xprocedure initfile(filename)
- X
- X    # Messy procedure which creates and stores the names of several
- X    # files that will be repeatedly used with "filename."  Reads in
- X    # the stats for filename from that file's .IS file.  Also reads in
- X    # the bitmap->offset (.OFS file) table, and puts it into
- X    # filestats[filename].ofs_table for later (re-)use.
- X    #
- X    # Returns the number of entries in the bitmap->offset table.
- X    # Fails if filename already has an entry in filestats.  Aborts
- X    # (exit codes 24, 23) if the .IS or .OFS file cannot be opened;
- X    # the abort messages are labelled with this procedure's name so
- X    # that they are accurate no matter which caller triggered the
- X    # initialization.
- X
- X    local IS_filename, in_IS, upto_field, stored_bitmap_length,
- X        ofs_filename, intext, cut_down_bitmap, block_size, offset
- X    # global filestats
- X    initial {
- X        filestats := table()
- X        # OS-specific parameters are initialized here.
- X        initialize_os_params()            # in indexutl.icn
- X    }
- X
- X    # Check for sloppy programming.  Did we do this one already??
- X    if not (/filestats[filename] := Fs(,,,,table())) then fail
- X
- X    filestats[filename].ind_filename :=
- X        dir_name(filename)||create_fname(filename, "IND")
- X    filestats[filename].bmp_filename :=
- X        dir_name(filename)||create_fname(filename, "BMP")
- X    filestats[filename].lim_filename :=
- X        dir_name(filename)||create_fname(filename, "LIM")
- X
- X    # Decode stored IS record for filename.
- X    IS_filename := dir_name(filename)||create_fname(filename, "IS")
- X    in_IS := open(IS_filename) | abort("initfile",
- X        "Can't open "||IS_filename||".  Did you forget to index?", 24)
- X    filestats[filename].IS := decode(!in_IS)
- X    close(in_IS)
- X
- X    # Having decoded IS, we can now determine the length of the cut-
- X    # down bitmaps stored in the .OFS file for filename.  The
- X    # comparison against seq(0,8) picks the first multiple of 8 that
- X    # is not smaller than the number of bits needed.
- X    upto_field := 1 < (filestats[filename].IS.no * 2) / 3 | 1
- X    stored_bitmap_length :=
- X        ((filestats[filename].IS.len * upto_field) <= seq(0,8))
- X
- X    # open .OFS file
- X    ofs_filename := dir_name(filename)||create_fname(filename, "OFS")
- X    intext := open(ofs_filename) |
- X        abort("initfile", "can't open "||ofs_filename, 23)
- X
- X    # read in blocks from .OFS file, breaking them into their
- X    # constituent parts; the loop ends when read_int() fails
- X    while block_size := read_int(intext, 8) * 8 do {
- X        cut_down_bitmap := read_int(intext, stored_bitmap_length)
- X        offset := read_int(intext, block_size - stored_bitmap_length)
- X        insert(filestats[filename].ofs_table, cut_down_bitmap, offset)
- X    }
- X
- X    close(intext)
- X    # For lack of a better thing to return, return the size of
- X    # the internal bitmap->offset table for filename.
- X    return *filestats[filename].ofs_table
- X
- Xend
- SHAR_EOF
- true || echo 'restore of initfile.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= retrieve.icn ==============
- if test -f 'retrieve.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping retrieve.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting retrieve.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'retrieve.icn' &&
- X############################################################################
- X#
- X# Name: retrieve.icn
- X#
- X# Title: retrieve locations of words in database file
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.13
- X#
- X############################################################################
- X#
- X# Retrieve(pattern, filename) retrieves all locations containing
- X# words matching pattern (arg1) in filename (arg2), placing them in a
- X# list. "Locations" are integer-coded pointers to places in filename
- X# where corresponding text is located. To actually retrieve that
- X# block of text, you must call bitmap_2_text(location, filename).
- X# Retrieve() only gathers up a list of locations in filename
- X# containing words which match pattern.
- X#
- X# The reason retrieve() doesn't do the logical thing - namely, to
- X# "retrieve" text itself - is that doing so might use a *lot* of
- X# memory. It is far more economical to retrieve text only when a
- X# given chunk is requested via bitmap_2_text().
- X#
- X# The format for filename must conform to a simple, but strict, set
- X# of guidelines. Basically, it must interleave a series of keys
- X# (so-called "bitmaps") with actual text:
- X#
- X# ::001:001:001
- X# This is text.
- X# ::001:001:002
- X# This is more text.
- X#
- X# The lines beginning with :: (a double colon) are the keys. These
- X# translate into an integer dividable internally into (in this case)
- X# three bit-fields of length 10 (enough to handle 999:999:999), which
- X# serve as location markers for the text that goes with them. See
- X# makeind.icn for precise instructions on how to construct and index
- X# files.
- X#
- X# Note: Patterns must match words in their entirety. For instance,
- X# retrieve("dog",filename) would only retrieve exact matches for the
- X# word "dog" in filename. To catch, say, "doggie" as well, it would
- X# be necessary to call retrieve with a regular expression that
- X# matched both dog and doggie (e.g. retrieve("dog.*",filename)).
- X#
- X############################################################################
- X#
- X# Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn
- X# ./findre.icn
- X#
- X# See also: makeind.icn, bmp2text.icn
- X#
- X############################################################################
- X
- Xlink codeobj
- X
- X# The following globals contain stats for current file (here, arg2).
- X# global filestats # declared in initfile.icn
- X# global IS # declared in indexutl.icn
- X
- Xprocedure retrieve(pattern, filename, inverse)
- X
- X    # Return a list of the bitmaps (locations) in filename (arg 2)
- X    # whose indexed words match pattern (arg 1); fail if there are
- X    # none.  If inverse (arg 3) is nonnull the sense of the search is
- X    # inverted: the bitmaps of every index entry that does NOT match
- X    # are collected instead.  Aborts (exit codes 22, 29, 20, 21) on a
- X    # missing filename or on files/pipes that cannot be opened.
- X    # Side effect: resets the global IS to the stats for filename.
- X
- X    local bitmap_list, bmp_file, in_egrep, intext, cmd, offset, line,
- X        quoted
- X    static is_UNIX, egrep_filename
- X    initial {
- X        if is_UNIX := find("UNIX",&features) then
- X            # If egrep is available, use it.  It's fast.
- X            egrep_filename := "egrep"
- X            # egrep_filename := "/usr/local/bin/gnuegrep"
- X    }
- X
- X    # Check for sloppy programming.
- X    /filename & abort("retrieve","you called me without a filename",22)
- X
- X    # Initialize important variables.
- X    #
- X    if /filestats | /filestats[filename]
- X    then initfile(filename)               # see initfile.icn
- X    bitmap_list := list()                 # list will contain locations of hits
- X    IS := filestats[filename].IS          # re-initialize IS for current file
- X    if /IS.is_case_sensitive then
- X        pattern := map(pattern)
- X
- X    # Open bitmap file.
- X    #
- X    bmp_file := open(filestats[filename].bmp_filename) |
- X        abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)
- X
- X    # Search index.
- X    #
- X    if are_metas(pattern) then {
- X        # NB:  are_metas() can be found in indexutl.icn
- X
- X        # If there are metacharacters in pattern, do a regexp pattern match.
- X        # The .IND file goes:  line ::= key \t other-stuff.
- X        pattern := "^(" || pattern || ")\t"
- X
- X        # If UNIX, then use egrep to search index.
- X        #
- X        if \is_UNIX then {
- X
- X            # The pattern is handed to /bin/sh inside single quotes,
- X            # so any single quote within it must be written as '\''
- X            # or it would end the quoting and let the rest of the
- X            # pattern be interpreted by the shell.
- X            quoted := ""
- X            pattern ? {
- X                while quoted ||:= tab(find("'")) do {
- X                    quoted ||:= "'\\''"
- X                    move(1)
- X                }
- X                quoted ||:= tab(0)
- X            }
- X
- X            # Set up command line to be passed to /bin/sh.  If
- X            # inverse is nonnull, invert the sense of the search
- X            # (i.e. egrep -v).  Note the blank before -v: without it
- X            # the command name and the option run together.
- X            if \inverse then {
- X                cmd := egrep_filename || " -v '" || quoted ||
- X                    "' " || filestats[filename].ind_filename
- X            } else {
- X                cmd := egrep_filename || " '" || quoted ||
- X                    "' " || filestats[filename].ind_filename
- X            }
- X            # open pipe
- X            in_egrep := open(cmd, "rp") |
- X                abort("retrieve","can't open pipe from\n\t"||cmd, 20)
- X            # grep .IND index file
- X            every line := !in_egrep do {
- X                line ? (tab(find("\t")+1), offset := integer(tab(0)))
- X                bitmap_list |||:= retrieve_bitmaps(offset, bmp_file)
- X            }
- X            every close(bmp_file | in_egrep)
- X
- X        # ...otherwise (i.e. if not UNIX) use findre() instead of egrep
- X        #
- X        } else {
- X
- X            # Probably MS-DOS or something else.  SLOW, SLOW!
- X            intext := open(filestats[filename].ind_filename) |
- X                abort("retrieve","can't open index file", 21)
- X            # grep .IND file
- X            if \inverse then {
- X                # if inverse is nonnull, invert the sense of the search
- X                every line := !intext do {
- X                    line ? {
- X                        if not findre(pattern) & tab(find("\t")+1) then {
- X                            bitmap_list |||:=
- X                                retrieve_bitmaps(integer(tab(0)), bmp_file)
- X                        }
- X                    }
- X                }
- X            } else {
- X                # inverse is null; don't invert the sense of the search
- X                every line := !intext do {
- X                    line ? {
- X                        if findre(pattern) & tab(find("\t")+1) then {
- X                            bitmap_list |||:=
- X                                retrieve_bitmaps(integer(tab(0)), bmp_file)
- X                        }
- X                    }
- X                }
- X            }
- X            every close(bmp_file | intext)
- X
- X        }
- X
- X    # If *not* are_metas(pattern), then pattern is a plain word.
- X    # No need to worry about is_UNIX, egrep, findre(), etc.
- X    #
- X    } else {
- X
- X        if \inverse then {
- X            # Inverted exact match: a binary search can only find the
- X            # one entry to leave out, so walk the whole .IND file and
- X            # collect the bitmaps of every entry whose key differs
- X            # from pattern (same line format: key \t offset).
- X            intext := open(filestats[filename].ind_filename) |
- X                abort("retrieve","can't open index file", 21)
- X            every line := !intext do {
- X                line ? {
- X                    if (tab(find("\t")) ~== pattern) & move(1) then {
- X                        bitmap_list |||:=
- X                            retrieve_bitmaps(integer(tab(0)), bmp_file)
- X                    }
- X                }
- X            }
- X            close(intext)
- X        } else {
- X            # Plain exact match: binary search of the index
- X            # (binary_index_search() may be found in binsrch.icn).
- X            if offset :=
- X                binary_index_search(pattern, filestats[filename].ind_filename)
- X            then bitmap_list |||:= retrieve_bitmaps(offset, bmp_file)
- X        }
- X        close(bmp_file)
- X    }
- X
- X    # We're done.  See if there were any hits.
- X    #
- X    if *bitmap_list > 0
- X    then return bitmap_list
- X    else fail
- X
- Xend
- X
- X
- X
- Xprocedure retrieve_bitmaps(offset, f)
- X
- X    # Pull one list of bitmaps out of the open file f, starting at
- X    # offset.  The list is stored as a count (read with
- X    # read_int(f, 16)) followed by that many fixed-width bitmaps.
- X    # Always returns a list, possibly empty.
- X
- X    local hits, width
- X    # global IS      # stats for the file currently being searched
- X
- X    seek(f, offset)
- X    hits := []
- X
- X    # Width of one stored bitmap: the first multiple of 8 that is
- X    # at least IS.len * IS.no.
- X    width := ((IS.len * IS.no) <= seq(0,8))
- X
- X    # The count is read once, when the range is set up; each pass
- X    # through the loop then appends the next bitmap from f.
- X    every 1 to read_int(f, 16) do
- X        put(hits, read_int(f, width))
- X
- X    return hits
- X
- Xend
- SHAR_EOF
- true || echo 'restore of retrieve.icn failed'
- rm -f _shar_wnt_.tmp
- fi
- # ============= indexutl.icn ==============
- if test -f 'indexutl.icn' -a X"$1" != X"-c"; then
- echo 'x - skipping indexutl.icn (File already exists)'
- rm -f _shar_wnt_.tmp
- else
- > _shar_wnt_.tmp
- echo 'x - extracting indexutl.icn (Text)'
- sed 's/^X//' << 'SHAR_EOF' > 'indexutl.icn' &&
- X############################################################################
- X#
- X# Name: indexutl.icn
- X#
- X# Title: indexing utilities
- X#
- X# Author: Richard L. Goerwitz
- X#
- X# Version: 1.19
- X#
- X############################################################################
- X#
- X# This file contains base_name(), dir_name(), get_index_fname(),
- X# stripchars(), abort(), and gettokens().
- X#
- X# base_name(s), dir_name(s) - like the Unix system commands
- X# create_fname(fname,ext) - get a new filename based on fname + ext
- X# stripchars(s,c) - strip chars c from string s
- X# abort(proc,msg,ecode) - abort procedure proc with exit code ecode
- X# write_int(f, int, size) - breaks int into 8-bit chunks & writes to f
- X# read_int(f, size) - like write_int, only constructs int from f
- X# are_metas(pattern) - succeeds if pattern has egrep-style metas
- X# digits_2_bitmap(s) - converts string 01:13:94 to an int-bitmap
- X#
- X############################################################################
- X#
- X# Links: ./findre.icn, radcon.icn, bincvt.icn
- X#
- X# See also: retrieve.icn, retrops.icn, bmp2text.icn, makeind.icn
- SHAR_EOF
- true || echo 'restore of indexutl.icn failed'
- fi
- echo 'End of part 4'
- echo 'File indexutl.icn is continued in part 5'
- echo 5 > _shar_seq_.tmp
- exit 0
- --
-
- -Richard L. Goerwitz goer%sophist@uchicago.bitnet
- goer@sophist.uchicago.edu rutgers!oddjob!gide!sophist!goer
-