home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 35 Internet
/
35-Internet.zip
/
htm2txt1.zip
/
HTM2TXT.CMD
< prev
next >
Wrap
OS/2 REXX Batch file
|
1997-03-13
|
35KB
|
867 lines
/* ----------------------------------------------------------------- */
lastmod='1997-03-11'
/* ----------------------------------------------------------------- */
/* variables to be customized */
/* */
/* following variables may be set to permanent installation */
/* specific values. they may be temporary modified */
/* by command line options */
/* option */
/* ------ */
linemax=72 /* maximum line length l n */
pixlbyt= 6 /* pixels per byte for tables p n */
editor ='E' /* editor for output file e [..] */
chain ='Y' /* follow url-chain f n */
showu ='N' /* show url link adddress u */
ofile ='.TXT' /* derive outfilename from ifile o name */
/* ----------------------------------------------------------------- */
/* 97-03-11 or accept width= in <table> tag */
/* 97-02-19 or define out-file name due to
Ralph_Ulrich@p31.lemmi.ftg.donut.de */
/* 97-02-19 or switch off chaining due to jblumel@gs.net */
/* 97-02-06 or accept missing tags </td>, </tr> */
/* 97-01-25 or substitute ... tokens, <table> correction */
/* 96-12-31 or <PRE> correction */
/* 96-12-18 or /FONT correction */
/* 96-11-17 or list of &constants due to tremro@digicom.qc.ca */
/* 96-11-07 or problem with nested tables */
/* 96-10-22 or filenames drag-drop due to sahag@ibm.net */
/* 96-10-21 or filenames with wildcard * due to sahag@ibm.net */
/* 96-10-19 or filenames with embedded blanks due to sahag@ibm.net */
/* 96-10-17 or rework width= due to lconyers@postmaster2.dot.gov */
/* 96-09-21 or rework follow href= error */
/* 96-09-11 or <a ...> tag due to etraas@te.xs4all.nl */
/* 96-08-21 or rework <table> tag error */
/* 96-08-03 or rework <table> tag due to "Kirchner Soft" */
/* 96-08-02 or rework <table> tag due to pinkas@en.com */
/* 96-07-16 or follow href= */
/* 96-06-29 or rework */
/* 96-04-17 or try to support <table> */
/* 96-01-15 or reworked */
/* 95-07-10 or decode HTML files */
/* ----------------------------------------------------------------- */
/* call: htm2txt infile [l nn [p nn [e editor [u [f n [o xxx */
/* output: infile-name.TXT */
/* */
/* recognised tags: */
/* */
/* all tags as supported by ibm webex 1.1b */
/* */
/* special tags: */
/* */
/* <trace> trace '?i' */
/* <exit> exit immediately */
/* */
/* recognised substitute variables see variable 'consts' */
/* tab-char ' ' will be ignored */
/* */
/* notes: */
/* */
/* all tags are converted as 'best fit'. */
/* the image a browser produces will not be met. */
/* */
/* --------------------------------------------------------- */
/* constants contributed by tremro@digicom.qc.ca */
/* */
consts= "space '20'x"
consts=consts "#32 '20'x"
consts=consts "quot '22'x"
consts=consts "#34 '22'x"
consts=consts "amp '00'x"
consts=consts "#38 '00'x"
consts=consts "#39 '27'x"
consts=consts "#58 '3a'x"
consts=consts "#60 <"
consts=consts "lt <"
consts=consts "#62 >"
consts=consts "gt >"
consts=consts "#91 ["
consts=consts "#93 ]"
consts=consts "nbsp '20'x"
consts=consts "#160 '20'x"
consts=consts "iexcl 'a1'x"
consts=consts "cent 'a2'x"
consts=consts "pound 'a3'x"
consts=consts "curren 'a4'x"
consts=consts "yen 'a5'x"
consts=consts "brvbar 'a6'x"
consts=consts "sect 'a7'x"
consts=consts "uml 'a8'x"
consts=consts "copy 'a9'x"
consts=consts "ordf 'aa'x"
consts=consts "laqno 'ab'x"
consts=consts "not 'ac'x"
consts=consts "shy 'ad'x"
consts=consts "reg 'ae'x"
consts=consts "hibar 'af'x"
consts=consts "deg 'b0'x"
consts=consts "plusmn 'b1'x"
consts=consts "sup2 'b2'x"
consts=consts "sup3 'b3'x"
consts=consts "acute 'b4'x"
consts=consts "micro 'b4'x"
consts=consts "para 'b6'x"
consts=consts "middot 'b7'x"
consts=consts "cedil 'b8'x"
consts=consts "sup1 'b9'x"
consts=consts "ordm 'ba'x"
consts=consts "raquo 'bb'x"
consts=consts "frac14 'bc'x"
consts=consts "frac12 'bd'x"
consts=consts "frac34 'be'x"
consts=consts "iquest 'bf'x"
consts=consts "Agrave 'c0'x"
consts=consts "Aacute 'c1'x"
consts=consts "Acirc 'c2'x"
consts=consts "Atilde 'c3'x"
consts=consts "Auml 'c4'x"
consts=consts "Aring 'c5'x"
consts=consts "AElig 'c6'x"
consts=consts "Ccedil 'c7'x"
consts=consts "Egrave 'c8'x"
consts=consts "Eacute 'c9'x"
consts=consts "Ecirc 'ca'x"
consts=consts "Euml 'cb'x"
consts=consts "Igrave 'cc'x"
consts=consts "Iacute 'cd'x"
consts=consts "Icirc 'ce'x"
consts=consts "Iuml 'cf'x"
consts=consts "ETH 'd0'x"
consts=consts "Ntilde 'd1'x"
consts=consts "Ograve 'd2'x"
consts=consts "Oacute 'd3'x"
consts=consts "Ocirc 'd4'x"
consts=consts "Otilde 'd5'x"
consts=consts "Ouml 'd6'x"
consts=consts "times 'd7'x"
consts=consts "Oslash 'd8'x"
consts=consts "Ugrave 'd9'x"
consts=consts "Uacute 'da'x"
consts=consts "Ucirc 'db'x"
consts=consts "Uuml 'dc'x"
consts=consts "Yacute 'dd'x"
consts=consts "THORN 'de'x"
consts=consts "szlig 'df'x"
consts=consts "agrave 'e0'x"
consts=consts "aacute 'e1'x"
consts=consts "acirc 'e2'x"
consts=consts "atilde 'e3'x"
consts=consts "auml 'e4'x"
consts=consts "aring 'e5'x"
consts=consts "aelig 'e6'x"
consts=consts "ccedil 'e7'x"
consts=consts "egrave 'e8'x"
consts=consts "eacute 'e9'x"
consts=consts "ecirc 'ea'x"
consts=consts "euml 'eb'x"
consts=consts "igrave 'ec'x"
consts=consts "iacute 'ed'x"
consts=consts "icirc 'ee'x"
consts=consts "iuml 'ef'x"
consts=consts "eth 'f0'x"
consts=consts "ntilde 'f1'x"
consts=consts "ograve 'f2'x"
consts=consts "oacute 'f3'x"
consts=consts "ocirc 'f4'x"
consts=consts "otilde 'f5'x"
consts=consts "ouml 'f6'x"
consts=consts "divide 'f7'x"
consts=consts "oslash 'f8'x"
consts=consts "ugrave 'f9'x"
consts=consts "uacute 'fa'x"
consts=consts "ucirc 'fb'x"
consts=consts "uuml 'fc'x"
consts=consts "yacute 'fd'x"
consts=consts "thorn 'fe'x"
consts=consts "yuml 'ff'x"
/* --------------------------------------------------------- */
/* check input parameters */
/* */
if arg(1)='' then do
say
say 'correct call is:'
say
say ' htm2txt infilename [options '
say ' infilename = fully qualified path'
say ' may contain wildcard *'
say ' options (any order)'
say ' l nn max nn chars in a line'
say ' p nn max nn pixels per byte'
say ' e editor name'
say ' o outfile name'
say ' f n do not follow url-chain'
say ' u show anchor url''s'
say
exit 4
end
parse arg arg
arg=strip(translate(arg,' ','"'))
z=pos('.',arg)
if z=0 then do
parse var arg ifiname options
ifiname=ifiname'.HTM'
end
else do
parse var arg ifiname '.' ifext options
ifiname=ifiname'.'ifext
end
/* --------------------------------------------------------- */
/* check for wildcard char in ifiname */
/* */
wcd.0=1; wcd.1=ifiname
swi_wcd = pos('*',ifiname)>0
if swi_wcd then do
if \RxFuncAdd('SysLoadFuncs','RexxUtil','SysLoadFuncs') then call 'SysLoadFuncs'
call SysFileTree ifiname,'WCD.','FO'
end
/* --------------------------------------------------------- */
/* check for options */
/* */
swi_url=0
swi_dbg=0
do while options \= ''
parse upper var options opt val options
select
when opt='L' then linemax=val
when opt='P' then pixlbyt=val
when opt='E' then editor =val
when opt='F' then chain =val
when opt='O' then ofile =val
when opt='U' then do; showu ='Y'; options=val options; end
when opt='D' then do; swi_dbg=1; options=val options; end
otherwise nop
end
end
if translate(showu)='N' then swi_url=0; else swi_url=1
if translate(chain)='Y' then swi_chn=1; else swi_chn=0
/* --------------------------------------------------------- */
/* activate debug facilities */
/* */
if swi_dbg then do
say 'debug active'
signal on syntax
signal on error
signal on failure
signal on halt
end
/* --------------------------------------------------------- */
/* delete output file */
/* */
if ofile='.TXT' then do
parse var ifiname ofiname '.' .
ofiname=ofiname'.TXT'
end
else ofiname=ofile
if swi_dbg then signal off error
'erase' '"'ofiname'"' '2>NUL'
if swi_dbg then signal on error
/* --------------------------------------------------------- */
/* loop on file-list */
/* */
call time 'r'
do wcd = 1 to wcd.0
ifiname=wcd.wcd
/* --------------------------------------------------------- */
/* some global controls */
/* */
hrf.0=1 /* href-control */
hrf.1=ifiname
call lineout ofiname,'HTM2TXT v.' lastmod
call lineout ofiname,' '
call lineout ofiname,'Extracted from' ifiname',' date()',' left(time(),5)
call lineout ofiname,' '
do nexthrf=1 while hrf.0>=nexthrf
call process_file hrf.nexthrf
end
/* --------------------------------------------------------- */
end /* end wildcard loop */
call lineout ofiname
/* --------------------------------------------------------- */
/* edit result */
swi_edt=(editor\='')
if swi_wcd then if wcd>1 then swi_edt=0
if swi_edt then 'start /F' editor '"'ofiname'"'
/* --------------------------------------------------------- */
laps=time('e')%1
min=laps%60
sec=laps//60
say 'finished' min':'right(sec,2,0) 'min'
exit 0
/* --------------------------------------------------------- */
/* process a file */
/* */
process_file: parse arg ifiname
/* --------------------------------------------------------- */
/* read infile */
/* */
nl ='0d'x /* new line character */
ifi=''
say
say 'reading' ifiname
/* --------------------------------------------------------- */
/* mod due to Ralph_Ulrich@p31.lemmi.ftg.donut.de */
/* */
call charin ifiname,1,0
ifiname_LEN = chars(ifiname)
ifi = charin( ifiname,1,ifiname_LEN)
call stream ifiname,'c','close'
say ifiname_LEN 'Bytes read from' ifiname
ifi=translate(ifi,' ','090A'x)
/* --------------------------------------------------------- */
/* format outfile lines */
/* */
ofi.0=0 /* out file controls */
dlspaces ='' /* <DL>-spaces */
lispaces ='' /* <LI>-spaces */
indents =0 /* number of indents */
blanklines=0 /* number of blank lines */
linelen =linemax /* max. linelength */
outtext ='' /* initial text */
/* switches: */
swi_pre = 0 /* switch PRE */
swi_tbl = 0 /* switch table */
swi_tr = 0 /* switch table row active */
swi_td = 0 /* switch def/hdr active */
swi_wid = 1 /* switch calc.col width */
swi_lst = 0 /* switch list definition */
swi_cnt = 0 /* switch center text */
swi_cat = 0 /* switch concatenate */
swi_trc = 0 /* switch trace */
cnt_tbl = 0 /* count nested tables */
wid_tbl = 0 /* width table width */
/* --------------------------------------------------------- */
/* scan input stream */
/* */
call charout ,'processing token '
text=''
count=0
do while length(ifi)>0
if swi_trc then do; interpret 'trace' tracetag; swi_dbg=1; end
/* check next line */
parse var ifi parttext '<' tag '>' ifi
if pos('<',tag)>0 then do
parse var tag tag '<' rest
ifi='<'rest'>'ifi
end
/* process text */
select
when swi_pre then call process_preformatted
when strip(parttext)=nl then nop
otherwise do
do while pos(nl,parttext)>0
parse var parttext a (nl) b
parttext=strip(a) strip(b)
end
if swi_cat then text=text||parttext
else do
if text='' then text= parttext
else text=text parttext
end
end
end
/* process tag */
tag=translate(tag,' ',nl)
if left(tag,1)='!' then tag='!' substr(tag,2)
parse var tag tag options
tag=translate(tag)
if tag='TRACE' then do
swi_trc=1
if pos('?',options)>0 then tracetag='?i'
else tracetag=' i'
end
swi_cat=0
count=count+1
if \swi_dbg then call charout , copies('08'x,6)||format(count,5)' '
select
when swi_tbl then do
select
when tag='TR' then do
if swi_td then call save_table_text
if swi_tr then call end_row
tabcol=0
drop tbtxt.
end
when tag='TD' ,
| tag='TH' then do
if swi_td then call save_table_text
swi_tr=1
swi_td=1
/* determine next column */
z=parmval('COLSTART',options)
if z=0 then tabcol=tabcol+1
else tabcol=z
if colmax<tabcol then do
colmax=tabcol
swi_wid=1
end
/* check for width= tag */
p=parmval('WIDTH',options)
if p>0 & \datatype(p,'NUM') then do
z=verify(p,'1234567890'); n=0
if z>0 then do
n=substr(p,z,1)
q=left(p,z-1)
end
select
when n='P' then do
tbwid.tabcol.0=q%pixlbyt
end
when n='%' then do
tbwid.tabcol.0=(q*linelen)%100
end
otherwise
if wid_tbl>0 then q=trunc(q/wid_tbl*100)
if q>linemax then q=linemax
tbwid.tabcol.0=q
end
end
if p>0 & datatype(p,'NUM') then do
if wid_tbl>0 then p=trunc(p/wid_tbl*100)
tbwid.tabcol.0=p
end
end
when tag='/TD' ,
| tag='/TH' then call save_table_text
when tag='/TR' then call end_row
when tag='TABLE' then do
if swi_td then call save_table_text
if swi_tr then call end_row
cnt_tbl=cnt_tbl+1
end
when tag='/TABLE' then do
if swi_td then call save_table_text
if swi_tr then call end_row
blanklines=0
call out ' '
cnt_tbl=cnt_tbl-1
swi_tbl=(cnt_tbl>0)
if \swi_tbl then do
wid_tbl=0
end
end
when tag='BR' then do
if colmax>1 then call save_table_text
else call out text
end
otherwise nop
end
end
when tag='TABLE' then do
call out text
blanklines=0
call out ' '
swi_tbl =1
swi_wid =1
cnt_tbl =cnt_tbl+1
swi_cnt =0
tbwid. =0
tblin. =0
tabcol =0
colmax =0
wid_tbl =parmval('WIDTH',options)
if \datatype(wid_tbl,'NUM') then wid_tbl=0
end
when tag='EXIT' then signal finish
when tag='!' then call out '***' options '***'
when tag='FONT',
| tag='/FONT' then swi_cat=1
when tag='UL',
| tag='OL',
| tag='DL',
| tag='DIR',
| tag='MENU',
then do
call out text
call out ' '
if lispaces='' then lispaces=' * '
else lispaces=' 'lispaces
indents=indents+1
swi_lst=1
end
when tag='LI' then call out text
when tag='DT' then do
call out text
dlspaces=' '
end
when tag='DD' then do
call out text
dlspaces=' '
end
when tag='/UL',
| tag='/OL',
| tag='/DL',
| tag='/DIR',
| tag='/MENU',
then do
call out text
dlspaces=''
lispaces=substr(lispaces,4)
if indents>0 then indents=indents-1
call out ' '
swi_lst=0
end
when tag='CENTER',
| tag='CENTRE',
then do
swi_cnt=1
end
when tag='/CENTER',
| tag='/CENTRE',
then do
swi_cnt=0
call out text
end
when tag='P',
| tag='/TITLE',
then call out text
when tag='/HEAD',
then do
call out text
call out ' '
end
when tag='PRE' then do
swi_pre=1
linelen=parmval('WIDTH',options)
end
when tag='/PRE' then do
swi_pre=0
linelen=linemax
end
when tag='HR' then do
call out text
call out copies('-',linelen)
end
when tag='H1',
| tag='H2',
| tag='H3',
| tag='H4',
| tag='/H1',
| tag='/H2',
| tag='/H3',
| tag='/H4',
| tag='/CAPTION',
then do
call out text
call out ' '
end
when tag='A' then do
parse upper var options 'HREF' . '"' hrefid '"'
nogo= pos('#',hrefid)>0
srefid=''
if swi_url,
& \nogo then do
srefid=hrefid
end
parse var hrefid z '.' fext
nogo=nogo|(left(fext,3)\='HTM')
parse var hrefid z 'FILE:' hrefid
if hrefid='' then hrefid=z
nogo=nogo|(strip(hrefid)='')
do i=1 to hrf.0
if hrf.i=hrefid then leave
end
if (i>hrf.0)&(\nogo)&(swi_chn) then do
hrf.0=hrf.0+1; z=hrf.0; hrf.z=hrefid
end
end
when tag='/A' then do
if swi_url,
& srefid\='' then do
text=text '('srefid')'
srefid=''
end
end
/*
when tag='IMG' then do
z=parmval('ALT',options)
if z\=0 then do
if swi_tbl then do
text=z
call save_table_text
end
else text=text z
end
end
*/
when tag='BR' then call out text
otherwise nop
end
/* all finished */
end
/* --------------------------------------------------------- */
/* write outfile */
/* */
finish:
do i=1 to ofi.0
call lineout ofiname,ofi.i
end
return
/* ========================================================= */
/* --------------------------------------------------------- */
/* close table row */
end_row:
swi_tr=0
swi_td=0
/* col-width already done ? */
if swi_wid then do
/* check predefined col-width */
colwi=0
do i=1 to colmax
if tbwid.i.0>0 then tbwid.i=tbwid.i.0
else tbwid.i=0
colwi=colwi+tbwid.i
end
linelen=linemax-colwi
if linelen<=0 then linelen=linemax
/* set col-width if not set */
do i=1 to colmax
if tbwid.i>0 then iterate
tbwid.i=linelen%colmax
end
linelen=linemax
/* check sum colwid exceeds */
sum_col=0
do i=1 to colmax
sum_col=sum_col+tbwid.i
end
if sum_col>linemax then do
ratio=linemax/sum_col
do i=1 to colmax
tbwid.i=trunc(tbwid.i/ratio)
end
end
end
swi_wid=0
/* get max nr. lines in row */
linmax=1
do i=1 to colmax
if linmax<tblin.i then linmax=tblin.i
end
/* fill uninitlzd variables */
do y=1 to linmax
do k=1 to colmax
tbtxt.k.y=subs(tbtxt.k.y)
if left(tbtxt.k.y,6)\='TBTXT.' then iterate
if k=1 then tbtxt.k.y='_'
else tbtxt.k.y=''
end
end
/* scan all lines all cols */
do y=1 to linmax
anytxt=0
do k=1 to colmax
if strip(tbtxt.k.y)='' then iterate
anytxt=1
leave
end
do while anytxt
anytxt=0
do k=1 to colmax
/* check length fits */
if length(tbtxt.k.y)>tbwid.k ,
& tbwid.k>0 then do
z=lastpos(' ',tbtxt.k.y,tbwid.k)
if z=0 then z=tbwid.k
otext=left(tbtxt.k.y,z) /* split text */
tbtxt.k.y=strip(substr(tbtxt.k.y,z))
anytxt=1
end
else do
otext=tbtxt.k.y
tbtxt.k.y=''
end
if tbtxt.1.y='' then tbtxt.1.y='_'
/* build output line */
text=text left(otext,tbwid.k)
end
/* all cols processed */
call out_table_text
end
end
tblin.=0
return
/* --------------------------------------------------------- */
/* save table-text */
/* */
save_table_text:
swi_td=0
if strip(text)\='' then do
tblin.tabcol=tblin.tabcol+1
z=tblin.tabcol
tbtxt.tabcol.z=text
end
text=''
return
/* --------------------------------------------------------- */
/* out table-text */
/* */
out_table_text:
text = strip(text)
if text ='' then return
if text \= '_' then call o text
text = ''
return
/* --------------------------------------------------------- */
/* process preformatted */
/* */
process_preformatted:
do while length(parttext)>0
parse var parttext outtext (nl) parttext
oli=subs(outtext)
ofi.0=ofi.0+1; z=ofi.0; ofi.z=outtext
end
return
/* --------------------------------------------------------- */
/* extract parameter values */
/* */
parmval: procedure; parse upper arg key,string
z=pos(key,string)
if z=0 then return 0
string=substr(string,z)
parse var string '=' val .
val=translate(val,' ','"')
val=translate(strip(val))
return val
/* --------------------------------------------------------- */
/* do output lines */
/* */
out:
oli=subs(arg(1))
oll=length(oli)
/* do not output more than 1 blank line */
if oll=0 then do
if blanklines>0 then return
blanklines=blanklines+1
end
if linelen>0 then do
do while oll>linelen
z=lastpos(' ',oli,linelen)
if z=0 then z=oll
if (z>0) then do
call o left(oli,z)
oli=strip(substr(oli,z+1))
oll=length(oli)
end
end
end
call o oli
if oll>0 then blanklines=0
text=''
return
o: procedure expose swi_cnt linelen indents dlspaces lispaces ofi.
parse arg ooo
if swi_cnt then do
z=(linelen-length(ooo))%2
if z>0 then prefix=copies(' ',z)
else prefix=''
end
else do
prefix=copies(' ',indents)||lispaces||dlspaces
end
ofi.0=ofi.0+1; z=ofi.0; ofi.z=prefix||ooo
return
/* --------------------------------------------------------- */
/* substitute constants */
/* */
subs: procedure expose consts count
l=arg(1)
/* check for tab chars */
l=translate(l,' ','09'x)
/* check for variables */
z=pos('&',l)
do while z > 0
parse var l head '&' token ';' tail
w=wordpos(token,consts)
if w=0 then do
if (left(token,1)='#')&(datatype(token,'NUM')) then do
token=substr(token,2)
token=d2c(token)
end
else do
token='?'token';'
end
end
else do
token=word(consts,w+1)
if right(token,2)="'x" then interpret "token="token
end
l=head||token||tail
z=pos('&',l)
end
return strip(translate(l,'&','00'x))
/* --------------------------------------------------------- */
syntax:
say 'signal on syntax in' sigl':' strip(sourceline(sigl))
signal common_error
error:
say 'signal on error in' sigl':' strip(sourceline(sigl))
signal common_error
failure:
say 'signal on failure in' sigl':' strip(sourceline(sigl))
signal common_error
halt:
say 'signal on halt in' sigl':' strip(sourceline(sigl))
signal common_error
common_error:
trace '?i'
do forever
nop
end
/* --------------------------------------------------------- */