The Mac 13

home *** CD-ROM | disk | FTP | other *** search

/ The Mac 13 / the-mac-13.iso / Shareware City / Utilities / setext2html / setext2html.pl < prev

Wrap

Perl Script | 1995-06-05 | 16.8 KB | 774 lines | [TEXT/R*ch]

#!/usr/bin/perl eval "exec /usr/bin/perl -S $0 $*" if $running_under_some_shell; # # Copyright Hans Verbrugge 1995 # hans@solair1.inter.nl.net # # $Id: setext2html.pl,v 1.4 1995/06/04 13:20:39 hans Exp hans $ # # this script translates setext formatted files # to html formatted files # # # # YOU, THE USER OF THIS SOFTWARE WILL USE THIS SOFTWARE AT YOUR OWN RISK. # IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES. # # based on setext.pl,v 2.8 # by Tony Sanders <sanders@earth.com> # #_______ NOT SUPPORTED (YET) _______ # digests # multisetext # href_tt # hot_tt, will only be diplayed as strong emphasized. #_______ TODO ________ # be more economical on memory usage #_______ HISTORY ________ ###### # v1.0, Hans Verbrugge # - initial version, never made it public # ###### # v1.1b1, Hans Verbrugge # - implemented chapter (on title_tt) splitting ###### # v1.1b2, Hans Verbrugge # - when splitting chapters , overview file gets the title of the # first chapter + "Overview" # - when splitting chapters, prefixes for output files # and the name of the index file # are userdefined ###### # v1.1b3, Hans Verbrugge # - made preparations to make this run under Unix ###### # v1.1b4, Hans Verbrugge # - url's are translated properly ###### # v1.1b5, Hans Verbrugge # - url's are not properly translated when it contains a ':portnumber'. # fixed # - handle url's first, translate setext special chars # like '_' into escape sequences so that further parsing # leaves the url alone ###### # v1.1, Robert A. Lentz <lentz@annie.astro.nwu.edu> # Hans Verbrugge # - fixed argument parsing # - fixed chdir bug ###### # v1.2, Hans Verbrugge # - my perl version for linux (v5.000) goes bananas # when feeded large regular expression substitutions. # Those are now splitted into seperate subroutines. # (DoUrl/SubstUrl/DoMail/SubstMail) ###### # v1.3, Hans Verbrugge # - when no files are specified on commandline # read from stdin ###### # v1.4, Hans Verbrugge # - implemented quote-tt handling #_______ GLOBALS _______ # comment if you're not running this on a mac $MACINTOSH=1; #_______ OPTIONS _______ if (defined ($MACINTOSH)) { require "StandardFile.pl"; # CONFIGURABLE PARAMETERS FOR MAC USERS # set to '0' if you don't want your chapters # stored into separate files $SPLIT_ON_CHAPTERS=0; # set to '0' if you don't want # mail addresses to be translated to the MAILTO: url $DO_MAIL = 1; # set to '0' if you don't want # url's to be translated. $DO_URL = 1; # set to '0' if you don't care what's going on :-) $VERBOSE = 1; # when splitting on chapters, this becomes the overview file, # rename it if you want another name $INDEX_FILE_PREFIX="index"; # when splitting on chapters, this becomes the prefix name of # the output file, so in this case chapter 1 gets stored in ch-1.html # rename it if you want another name $CHAPTER_PREFIX="ch-"; } else { # default settings for unix $SPLIT_ON_CHAPTERS=0; $DO_MAIL = 0; $DO_URL = 0; $VERBOSE = 0; $INDEX_FILE_PREFIX="index"; $CHAPTER_PREFIX="ch-"; #get current directory $DIR = `pwd`; chop ($DIR); } $SECTION=0; $CHAPTER_NO=0; $title=""; $firstchapter=""; $stylechange=""; #_______ arrays _______ %FILES=(); %SPLITTED_CHAPTERS=(); $lines=""; $hrefs=""; #_______ states _______ $INDENT = 1; $MONO =2; $PREFORMAT = 3; $STATE = $INDENT; $UNFOLDING = 0; # html styles $html_addr= "ADDRES"; $html= "HTML"; $html_head= "HEAD"; $html_body= "BODY"; $html_title= "TITLE"; $html_bold= "B"; $html_italic= "I"; $html_uline= "EM"; $html_emph= "EM"; $html_strongemph= "STRONG"; $html_block= "BLOCKQUOTE"; $html_preformat= "PRE"; $html_paragraphbreak= "P"; #_______ setext styles _______ $title_tt= "^(={2,})\\s*\$"; $subhead_tt= "^(-{2,})\\s*\$"; $indent_tt= "^ "; $bold_tt= "\\*\\*([^\\*]*)\\*\\*"; $italic_tt= "~([^\\s]*)~"; $underline_tt= "_([^\\s]*)_"; $hot_tt= "\\b([^\\s]*)_\\b"; $include_tt= "^>\\s*(.*)"; $bullet_tt= "^\\*\\s+(.*)"; $quote_tt= "`_(\\b+)_`"; #### MAIN #### while ($ARGV[0] =~ /^[^-].*/) { $_ = shift @ARGV; $FILES{$_}=""; } if (! defined ($MACINTOSH)) { &parseopts(); } # on a mac ask for input if no files were found in @ARGV if (! %FILES && defined ($MACINTOSH)) { $ifile = &StandardFile'GetFile("open source file:", "TEXT", "TEXT"); $ifile eq '' && &Fail ("no input file given!\n"); $FILES{$ifile} = "bla"; } # on a mac set stderr unbuffered if we are in verbose mode if ($VERBOSE == 1 && defined ($MACINTOSH)) { select (stderr); $| = 1; select (stderr); } # on a mac let user choose a folder to save splitted files, # on unixbox directory was given on the commandline if ($SPLIT_ON_CHAPTERS == 1) { if (defined $MACINTOSH) { $DIR = &MacChooseOuput(); } # split the chapters in directory $DIR &SplitChapters($DIR); # now change directory to $DIR chdir ($DIR) || &Fail ("cd $DIR: $!\n"); } else { # force a loop when reading from stdin if (! %FILES) {$FILES{"USE SINPUT"} = "";} } print STDERR "parsing ...\n" if $VERBOSE; foreach $file (sort keys %FILES) { local ($ofile) = $FILES{$file}; if (defined $MACINTOSH && ! $SPLIT_ON_CHAPTERS) { $ofile = &MacChooseOuput($file); } &do_it ($file, $ofile); } if ($SPLIT_ON_CHAPTERS) { foreach $file (sort keys %FILES) { unlink $file; } &BuildContents(); } print STDERR "done\n" if $VERBOSE; exit 0; ############## sub parseopts { while (@ARGV) { $_ = shift (@ARGV); if (/^-v$/) {$VERBOSE = 1;} elsif (/^-m$/) {$DO_MAIL = 1;} elsif (/^-u$/) {$DO_URL = 1;} elsif (/^-s$/) {$SPLIT_ON_CHAPTERS = 1;} elsif (/^-d(.*)/) {$DIR = $1 ? $1 : shift(@ARGV);} elsif (/^-i(.*)/) {$INDEX_FILE_PREFIX = $1 ? $1 : shift(@ARGV);} elsif (/^-p(.*)/) {$CHAPTER_PREFIX = $1 ? $1 : shift(@ARGV);} elsif (/^-(.)/) { print STDERR "unrecognized option: $_\n"; &Usage; exit (1); } else { print STDERR "unrecognized token: $_\n"; &Usage(); exit (1); } } } # build the contents file when splitting on chapters sub BuildContents { print STDERR "building ${INDEX_FILE_PREFIX}.html...\n" if $VERBOSE; open (CONTENTS,">${INDEX_FILE_PREFIX}.html") || &Fail ("${INDEX_FILE_PREFIX}.html:$!\n"); print CONTENTS <<"EOF"; <${html}> <${html_head}> <${html_title}>${firstchapter} Overview</${html_title}> </${html_head}> <${html_body}> <H2><CENTER>${firstchapter} Overview</CENTER></H2><P> <HR> <UL> EOF # translate funny chars back to html grep (do {&XlateSpecialChars} && 0,%SPLITTED_CHAPTERS); foreach $key (sort bynumber keys %SPLITTED_CHAPTERS) { $file = ${CHAPTER_PREFIX} . ${key}; print CONTENTS " <LI><A HREF=\"${file}.html\">$SPLITTED_CHAPTERS{$key}</A>\n"; } print CONTENTS <<"EOF"; </UL> <P> <HR> </${html_body}> </${html}> EOF } # do_it; parse input ifile and write to output file ofile sub do_it { local ($ifile,$ofile) = @_; local ($i); local ($key); if ($ifile eq "USE SINPUT") { open (IFILE, "<&STDIN") || &Fail ("couldn't dup stdin:$!\n"); } else { open (IFILE,$ifile) || &Fail ("$file:$!\n"); } # fill lines array; $lines = ""; $#lines=0; while (<IFILE>) { chop; # eat white $_=~s/\s+$//go; $lines[$.-1]=$_; } close (IFILE); if ($ofile eq '') { open (OUTFILE, ">&STDOUT") || &Fail ("couldn't dup stdout:$!\n"); } else { open (OUTFILE,">${ofile}") || &Fail ("Couldn't create ${ofile}:$!\n"); } # translate special characters in # their html counterparts grep (do {&TranslateSpecialChars} && 0,@lines); # process subheads & titles &HandleSubheads; print " $title...\n" if $VERBOSE; # translate funny chars back to html grep (do {&TranslateSpecialCharsBack} && 0,@hrefs); # build the header &MakeHeader; print OUTFILE "<${html_body}>\n"; print OUTFILE "<H2><CENTER>$title</CENTER></H2><P>\n"; print OUTFILE "<HR>"; print OUTFILE "<UL>\n"; for ($i = 0; $i <= $#hrefs; $i++) { print OUTFILE " <LI>$hrefs[$i]\n"; } print OUTFILE "</UL>\n"; print OUTFILE "<P>"; #print OUTFILE "<HR>"; &ParseSetext; # translate special characters in # their html counterparts #&TranslateSpecialCharsBack; grep (do {&TranslateSpecialCharsBack} && 0,@lines); for ($i = 0; $i <= $#lines; $i++) { $_ = $lines[$i]; print OUTFILE "$_\n" unless /^\.\. /; } print OUTFILE "</${html_body}>\n"; close (OUTFILE); } sub CheckForSetext { local ($file) = @_; open (IFILE,$file) || &Fail ("Couldn't open $file:\n$!"); while (<IFILE>) { if (/^==*/ || /^--*/) { close (IFILE); return 1; } } return 0; } sub Fail { local ($why)=@_; if (defined ($MACINTOSH)) { &MacPerl'Answer ("$why", "Bye"); exit; } else { die "$why"; } } # open the input file, if script runs on a Mac # ask the user where to save the new file # otherwise print open STDOUT sub MacChooseOuput { local ($ifile) = @_; if (defined ($MACINTOSH)) { local ($file,$ofile); if ($SPLIT_ON_CHAPTERS != 1) { $file = $1 if $ifile =~ /([^:]*)$/; # basename $ofile = &StandardFile'PutFile("Save htmlized version as:", "$file.html"); exit unless $ofile; return $ofile; } else { local ($folder) = &StandardFile'GetFolder("save chapters where?"); exit unless $folder; return $folder; } } } # translate special characters to a nonprintable char # so we can recognize it later. # primary use is to *not* screw up HREFS in chapter overviews sub XlateSpecialChars { s/&/&/go; s/</</go; s/>/>/go unless /^>/; s/"/"/go; s/ö/ö/go; s/ñ/ñ/go; s/È/È/go; } sub TranslateSpecialChars { s/&/\01/go; s/</\02/go; s/>/\03/go unless /^>/; s/"/\04/go; s/ö/\05/go; s/ñ/\06/go; s/È/\07/go; } sub TranslateSpecialCharsBack { s/\01/&/go; s/\02/</go; s/\03/>/go; s/\04/"/go; s/\05/ö/go; s/\06/ñ/go; s/\07/È/go; } sub SplitChapters { local ($dir) = @_; local ($file) = ""; local ($i,$_); local ($firstLineIsChapter); local ($currentdir) = `pwd`; chop ($currentdir); print STDERR "splitting...\n" if $VERBOSE; # force a loop when no input files where given if (! %FILES) {$FILES{"USE SINPUT"} = "";} foreach $file (sort keys %FILES) { if ($file eq "USE SINPUT") { open (IFILE, "&STDIN") || &Fail ("couldn't dup stdin:$!\n"); } else { open (IFILE,$file) || &Fail ("$file:$!\n"); } chdir $dir || &Fail ("cd $dir:$!\n"); # fill lines array; while (<IFILE>) { chop; # eat white $_=~s/\s+$//go; $lines[$.-1]=$_; } close (IFILE); $firstLineIsChapter = 0; for ($i = 0; $i <= $#lines; $i++) { $_ = $lines[$i]; # check for title_tt if (/$title_tt/) { if (length ($_) == length ($lines[$i-1])) { # got a title_tt $i == 1 && ($firstLineIsChapter = 1); (($i > 1) && ($firstLineIsChapter == 0)) && &Fail("$file: when splitting on chapters,\nfirst line in the file should be title_tt\n"); $firstchapter = $lines[$i-1] unless $firstchapter; $lines[$i-1] = "\033\036" . $lines[$i-1]; print STDERR " ", substr ($lines[$i-1],2), "\n" if $VERBOSE; } } } for ($i = 0; $i <= $#lines; $i++) { $_ = $lines[$i]; if (s#^\033\036(.*)#$1#go) { close (CHAPTER) if ($CHAPTER_NO++ > 1); local ($splittedfile) = "${CHAPTER_PREFIX}${CHAPTER_NO}"; open (CHAPTER,">${splittedfile}") || &Fail ("${splittedfile}:$!\n"); $SPLITTED_CHAPTERS{"$CHAPTER_NO"}=$_; } print CHAPTER $_,"\n"; } close (CHAPTER); # back to working direcory chdir($currentdir); } # rebuild %FILES array %FILES=(); foreach $key (sort bynumber keys %SPLITTED_CHAPTERS) { $file = ${CHAPTER_PREFIX} . ${key}; $FILES{$file} = "${file}.html" } } # handle subheads & titles; # the lines are stripped from trailing whitespace, # if a line is followed by a line containing a equal # number of '=' or '-' starting at position 1, # then that line is a valid title_tt or subhead_tt. sub HandleSubheads { local ($issetext)=0; local ($y)=0; local ($i,$_); $title=""; $#hrefs=0; $SECTION = 0; for ($i = 0; $i <= $#lines; $i++) { $_ = $lines[$i]; # check for title_tt if (/$title_tt/) { if (length ($_) == length ($lines[$i-1])) { $SECTION++; $issetext = 1; $title = $lines[$i-1] unless $title ne ""; $hrefs[$y++]="<A HREF=\"\#sub${SECTION}\">$lines[$i-1]</A>"; $lines[$i-1]=~s#(.*)# <HR>\n <H2><A NAME=\"sub${SECTION}\">$1</A></H2>#; $lines[$i] = '.. '; next; } } # check for subhead_tt elsif (/$subhead_tt/) { if (length ($_) == length ($lines[$i-1])) { $SECTION++; $issetext = 1; $hrefs[$y++]="<A HREF=\"\#sub${SECTION}\">$lines[$i-1]</A>"; $lines[$i-1]=~s#(.*)# <HR>\n <H3><A NAME=\"sub${SECTION}\">$1</A></H3>#; $lines[$i] = '.. '; next; } } } # if we don't have a setext file, die! &Fail ("not a setext file!\n") unless $issetext == 1; } sub MakeHeader { print OUTFILE "<${html_head}>\n"; &PrintHtmlStyle ($html_title, $title) unless ($title eq ""); print OUTFILE "</${html_head}>\n"; } sub ParseSetext { local ($i, $_); for ($i = 0; $i <= $#lines; $i++) { $_ = $lines[$i]; next if (/^\.\. /); if ($STATE == $INDENT && /^$/) { $lines[$i] .= "\n<${html_paragraphbreak}>\n" unless $UNFOLDING++; #print OUTFILE "<${html_paragraphbreak}>\n" unless $UNFOLDING++; next; } $UNFOLDING = 0; # state transitions if (/^>\s/) { &to_mono; } elsif (/^ [^ ]/) { &to_fmt; } else { &to_pre; } s/^>\s*//; # fix quote-tt s/^ ([^ ])/\1/; # fix indent-tt # quote-tt &DoQuote(); # urls $DO_URL == 1 && &DoUrl(); # mail adresses $DO_MAIL == 1 && &DoMail(); # bullet_tt s#^\*\s+(.*)#<UL><LI>$1</UL>#o; # bold_tt s#\*\*([^\*]*)\*\*#<$html_bold>$1</$html_bold>#go; # italic_tt s#~([^\s]*)~#<$html_italic>$1</$html_italic>#go; # underline_tt s#_([^\s]*)_#($a = $1) =~ s,_, ,g; "\<$html_uline\>$a\<\/$html_uline\>"; #ge; # hot_tt s#\b([^\s]*)_\b#<$html_strongemph>$1</$html_strongemph>#go; s,\0331,_,go; s,\0332,~,go; s,\0333,",go; s,\0334,*,go; $lines[$i] = join ('',$stylechange,$_); } } sub DoUrl { s#(ftp|http|wais|telnet|gopher|news)(://[\w-\+]+)(([\.~][\w-\+/\*:\?_\#]+){0,})#&SubstUrl();#ge; } sub SubstUrl { local ($a) = join ('',$1,$2,$3); $a =~ s,_,\0331,go; $a =~ s,~,\0332,go; return "<A HREF=\0333$a\0333>$a</A>"; } sub DoMail { s#([\w-\+.!]+[@!][\w-\+]+)((\.[\w-\+]+){0,})#&SubstMail();#ge; } sub SubstMail { local ($a) = join ('',$1,$2); $a =~ s,_,\0331,go; $a =~ s,~,\0332,go; return "<A HREF=\0333mailto:$a\0333>$a</A>"; } sub DoQuote { # quoted typotags should be passed unmodified, # change typetags special chars to escape sequences s#'_(.*)!_'#&SubstQuote();#ge; } sub SubstQuote { local ($a) = $1; $a =~ s,_,\0331,go; $a =~ s,~,\0332,go; $a =~ s,\*,\0334,go; return "$a"; } sub PrintHtmlStyle { local ($style, $what) = @_; print OUTFILE "\<$style\>$what\<\/$style\>\n"; } sub to_fmt { $STATE == $INDENT && do {$stylechange = ""; return}; $stylechange = "</PRE>\n" if $STATE == $PREFORMAT; $stylechange = "</PRE></BLOCKQUOTE>\n" if $STATE == $MONO; $STATE = $INDENT; } sub to_pre { $STATE == $PREFORMAT && do {$stylechange = ""; return}; $stylechange = "<PRE>\n" if $STATE == $INDENT; $stylechange = "</PRE></BLOCKQUOTE><PRE>\n" if $STATE == $MONO; $STATE = $PREFORMAT; } sub to_mono { $STATE == $MONO && do {$stylechange = ""; return}; $stylechange = "<BLOCKQUOTE><PRE>\n" if $STATE == $INDENT; $stylechange = "</PRE><BLOCKQUOTE><PRE>\n" if $STATE == $PREFORMAT; $STATE = $MONO; } sub bynumber {$a <=> $b} sub Usage { $0 = $1 if $0 =~ /([^\/]*)$/; # basename print STDERR <<"EOF"; $0 usage: $0 [file(s)] [-v] [-m] [-u] [-s [-d directory -i indexprefix -p filesprefix]] EOF }