home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 10 Tools
/
10-Tools.zip
/
OL.LZH
/
PROCS.LZH
/
SENTENCE.ICN
< prev
next >
Wrap
Text File
|
1991-07-13
|
5KB
|
153 lines
############################################################################
#
# Name: sentence.icn
#
# Title: generate sentences in file
#
# Author: Richard L. Goerwitz
#
# Date: June 3, 1991
#
# Version: 1.2
#
############################################################################
#
# sentence(f) - suspends sentences from file f
#
# A lot of grammatical and stylistic analysis programs are predicated
# on the notion of a sentence. For instance, some programs count the
# number of words in each sentence. Others count the number and length
# of clauses. Still others pedantically check for sentence-final par-
# ticles and prepositions.
#
# This procedure, sentence(), is supposed to be used as a filter for
# ASCII text files, suspending everything that looks remotely like a
# sentence in them.
#
# BUGS: Cannot correctly parse sentences with constructs like "R. L.
# Goerwitz" in them. The algorithm can be much improved simply by
# checking to see if the word after the period is in /usr/dict/words
# or whatever your system dictionary file is. If it isn't, then it's
# likely not to be the beginning of a sentence (this also is not in-
# fallible, naturally).
#
############################################################################
#
# Requires: co-expressions
#
############################################################################
#
# sentence(intext) - generate everything in intext that looks like a
# sentence.
#
# intext is handed unchanged to read_line(), which is run as a
# co-expression and supplies space-terminated lines, plus an empty
# string wherever it wants to signal a sentence-like division.
#
# Results are suspended one sentence at a time; whatever text is left
# over when the input runs out is returned as the final result (it may
# be the empty string).
#
procedure sentence(intext)

    local sentence, get_line, line, tmp_s, end_part, whole_thing
    local ahead, resume_at, saw_break
    static inits, punct
    initial {
        inits := &ucase ++ &digits
        punct := ".\"'!?)]"
    }

    sentence := ""
    get_line := create read_line(intext)

    while line := @get_line do {

        # An empty string is read_line's signal for a change in the
        # indentation level.  Treat it as a sentence break: emit what
        # has been collected so far and start over.
        if line == "" then {
            suspend sentence
            sentence := ""
            next
        }

        # Keep going until no more sentence endings can be found in
        # line, then break out and fetch another line.
        repeat {

            # Scan for a sentence break somewhere in line.
            line ? {

                # Look for a run of punctuation containing a period,
                # question mark or exclamation point, followed by a
                # blank and then either another blank or a word that
                # starts with a capital letter or digit (optionally
                # preceded by opening quotes or parentheses).
                #
                # If the punctuation sits at the very end of line there
                # is not enough context, so the next line is fetched.
                # A real line is appended and the scan is restarted.  An
                # empty string (read_line's break signal) must not be
                # appended and forgotten, so it is recorded in saw_break
                # and acted on once this scan is over.  The /saw_break
                # guard keeps a later backtracking alternative from
                # reading yet another line after the signal was seen.
                #
                # resume_at remembers the position just after the
                # separating blank, so that opening quotes/parentheses
                # matched while checking the context stay part of the
                # following sentence.
                if tmp_s := tab(upto(punct)) &
                    upto('!?.', end_part := tab(many(punct))) &
                    not (pos(-1) & /saw_break & (ahead := @get_line) &
                         { if ahead == "" then { saw_break := 1; &fail }
                           else { line ||:= ahead; next } }) &
                    =" " & (resume_at := &pos) &
                    (=" " | (tab(many('\'"('))|&null,any(inits)))
                # IF YOU WANT TO ADD A DICTIONARY CHECK, then read in
                # a dictionary like /usr/dict/words, and then change
                # any(inits) above to something like (any(inits),
                # longstr(list_of_usrdictwords,map(&subject),&pos), =" ")
                # where longstr() matches each string in list_of_usr-
                # dictwords.
                then {
                    # Don't bother with little two-letter hunks.
                    whole_thing := sentence || tmp_s || end_part
                    if *whole_thing > 3 | find(" ",whole_thing)
                    then suspend whole_thing
                    # Continue with the text that follows the blank(s)
                    # after the sentence, opening quotes included.
                    &pos := resume_at
                    tab(many(' '))
                    line := tab(0)
                    sentence := ""
                    next
                }
                else break
            }
        }

        # Otherwise just tack line onto sentence & try again.
        sentence ||:= line

        # The look-ahead consumed an indentation-change signal; honor
        # it now exactly as the test at the top of the loop would have.
        if \saw_break then {
            saw_break := &null
            suspend sentence
            sentence := ""
        }
    }

    return sentence

end
#
# read_line(intext) - generate the non-blank elements of !intext,
# reformatted for sentence().
#
# Each result has its leading and trailing blanks/tabs removed and a
# single blank appended, unless the line ends in a dash, in which case
# it is left without the blank so a following line joins it directly.
#
# An empty string is generated ahead of a line whenever that line is
# indented more deeply than the previous non-blank line, or less
# deeply than it with at least one blank line in between.
#
# All state is kept in locals (which survive suspension), so several
# read_line generators can be active at once without disturbing each
# other.
#
procedure read_line(intext)

    local new_line, ilevel, line, last_ilevel, blank_flag

    last_ilevel := 0

    while line := trim(!intext,'\t ') do {

        # Check to see if line is blank; if so, set blank_flag.
        if line == "" then
            { blank_flag := 1; next }

        # Determine current indentation level (tabs expanded first).
        detab(line) ? {
            ilevel := *tab(many(' ')) | 0
        }

        line ? {
            # Skip over the leading white space.
            tab(many('\t '))

            # Signal the calling procedure if there is a change in the
            # indentation level by suspending an empty string.
            if (ilevel > last_ilevel) | (ilevel < last_ilevel, \blank_flag)
            then suspend ""
            last_ilevel := ilevel

            # Put a space on the end of line, unless it ends in a dash.
            new_line := tab(-1) || (="-" | (move(1) || " "))

            # Make sure the flag that indicates blank lines is unset.
            blank_flag := &null
        }

        # Suspend the newly reformatted, trimmed, space-terminated line.
        suspend new_line
    }

end