home *** CD-ROM | disk | FTP | other *** search
Perl Script | 1995-06-05 | 16.8 KB | 774 lines | [TEXT/R*ch] |
- #!/usr/bin/perl
- eval "exec /usr/bin/perl -S $0 $*"
- if $running_under_some_shell;
- #
- # Copyright Hans Verbrugge 1995
- # hans@solair1.inter.nl.net
- #
- # $Id: setext2html.pl,v 1.4 1995/06/04 13:20:39 hans Exp hans $
- #
- # this script translates setext formatted files
- # to html formatted files
- #
- #
- #
- # YOU, THE USER OF THIS SOFTWARE WILL USE THIS SOFTWARE AT YOUR OWN RISK.
- # IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- # DAMAGES.
- #
- # based on setext.pl,v 2.8
- # by Tony Sanders <sanders@earth.com>
- #
-
- #_______ NOT SUPPORTED (YET) _______
- # digests
- # multisetext
- # href_tt
- # hot_tt, will only be diplayed as strong emphasized.
-
- #_______ TODO ________
- # be more economical on memory usage
-
-
- #_______ HISTORY ________
-
- ######
- # v1.0, Hans Verbrugge
- # - initial version, never made it public
- #
-
- ######
- # v1.1b1, Hans Verbrugge
- # - implemented chapter (on title_tt) splitting
-
-
-
- ######
- # v1.1b2, Hans Verbrugge
- # - when splitting chapters , overview file gets the title of the
- # first chapter + "Overview"
- # - when splitting chapters, prefixes for output files
- # and the name of the index file
- # are userdefined
-
- ######
- # v1.1b3, Hans Verbrugge
- # - made preparations to make this run under Unix
-
- ######
- # v1.1b4, Hans Verbrugge
- # - url's are translated properly
-
- ######
- # v1.1b5, Hans Verbrugge
- # - url's are not properly translated when it contains a ':portnumber'.
- # fixed
- # - handle url's first, translate setext special chars
- # like '_' into escape sequences so that further parsing
- # leaves the url alone
-
- ######
- # v1.1, Robert A. Lentz <lentz@annie.astro.nwu.edu>
- # Hans Verbrugge
- # - fixed argument parsing
- # - fixed chdir bug
-
- ######
- # v1.2, Hans Verbrugge
- # - my perl version for linux (v5.000) goes bananas
- # when feeded large regular expression substitutions.
- # Those are now splitted into seperate subroutines.
- # (DoUrl/SubstUrl/DoMail/SubstMail)
-
- ######
- # v1.3, Hans Verbrugge
- # - when no files are specified on commandline
- # read from stdin
-
- ######
- # v1.4, Hans Verbrugge
- # - implemented quote-tt handling
-
- #_______ GLOBALS _______
- # comment if you're not running this on a mac
- $MACINTOSH=1;
-
- #_______ OPTIONS _______
-
- if (defined ($MACINTOSH)) {
- require "StandardFile.pl";
-
- # CONFIGURABLE PARAMETERS FOR MAC USERS
-
- # set to '0' if you don't want your chapters
- # stored into separate files
- $SPLIT_ON_CHAPTERS=0;
-
- # set to '0' if you don't want
- # mail addresses to be translated to the MAILTO: url
- $DO_MAIL = 1;
-
- # set to '0' if you don't want
- # url's to be translated.
- $DO_URL = 1;
-
- # set to '0' if you don't care what's going on :-)
- $VERBOSE = 1;
-
- # when splitting on chapters, this becomes the overview file,
- # rename it if you want another name
- $INDEX_FILE_PREFIX="index";
-
- # when splitting on chapters, this becomes the prefix name of
- # the output file, so in this case chapter 1 gets stored in ch-1.html
- # rename it if you want another name
- $CHAPTER_PREFIX="ch-";
-
- }
- else {
- # default settings for unix
- $SPLIT_ON_CHAPTERS=0;
- $DO_MAIL = 0;
- $DO_URL = 0;
- $VERBOSE = 0;
- $INDEX_FILE_PREFIX="index";
- $CHAPTER_PREFIX="ch-";
-
- #get current directory
- $DIR = `pwd`;
- chop ($DIR);
-
-
- }
- $SECTION=0;
- $CHAPTER_NO=0;
-
- $title="";
- $firstchapter="";
- $stylechange="";
-
- #_______ arrays _______
- %FILES=();
- %SPLITTED_CHAPTERS=();
- $lines="";
- $hrefs="";
-
-
- #_______ states _______
-
- $INDENT = 1;
- $MONO =2;
- $PREFORMAT = 3;
- $STATE = $INDENT;
- $UNFOLDING = 0;
-
-
- # html styles
- $html_addr= "ADDRES";
- $html= "HTML";
- $html_head= "HEAD";
- $html_body= "BODY";
- $html_title= "TITLE";
- $html_bold= "B";
- $html_italic= "I";
- $html_uline= "EM";
- $html_emph= "EM";
- $html_strongemph= "STRONG";
- $html_block= "BLOCKQUOTE";
- $html_preformat= "PRE";
- $html_paragraphbreak= "P";
-
- #_______ setext styles _______
- $title_tt= "^(={2,})\\s*\$";
- $subhead_tt= "^(-{2,})\\s*\$";
- $indent_tt= "^ ";
- $bold_tt= "\\*\\*([^\\*]*)\\*\\*";
- $italic_tt= "~([^\\s]*)~";
- $underline_tt= "_([^\\s]*)_";
- $hot_tt= "\\b([^\\s]*)_\\b";
- $include_tt= "^>\\s*(.*)";
- $bullet_tt= "^\\*\\s+(.*)";
- $quote_tt= "`_(\\b+)_`";
-
-
-
- #### MAIN ####
- while ($ARGV[0] =~ /^[^-].*/) {
- $_ = shift @ARGV;
- $FILES{$_}="";
-
- }
-
- if (! defined ($MACINTOSH)) {
- &parseopts();
- }
-
- # on a mac ask for input if no files were found in @ARGV
- if (! %FILES && defined ($MACINTOSH)) {
- $ifile = &StandardFile'GetFile("open source file:", "TEXT", "TEXT");
- $ifile eq '' && &Fail ("no input file given!\n");
- $FILES{$ifile} = "bla";
- }
-
- # on a mac set stderr unbuffered if we are in verbose mode
- if ($VERBOSE == 1 && defined ($MACINTOSH)) {
- select (stderr); $| = 1; select (stderr);
- }
-
- # on a mac let user choose a folder to save splitted files,
- # on unixbox directory was given on the commandline
- if ($SPLIT_ON_CHAPTERS == 1) {
- if (defined $MACINTOSH) {
- $DIR = &MacChooseOuput();
- }
- # split the chapters in directory $DIR
- &SplitChapters($DIR);
-
- # now change directory to $DIR
- chdir ($DIR) || &Fail ("cd $DIR: $!\n");
- }
- else {
- # force a loop when reading from stdin
- if (! %FILES) {$FILES{"USE SINPUT"} = "";}
- }
-
- print STDERR "parsing ...\n" if $VERBOSE;
-
- foreach $file (sort keys %FILES) {
- local ($ofile) = $FILES{$file};
-
- if (defined $MACINTOSH && ! $SPLIT_ON_CHAPTERS) {
- $ofile = &MacChooseOuput($file);
- }
- &do_it ($file, $ofile);
- }
- if ($SPLIT_ON_CHAPTERS) {
- foreach $file (sort keys %FILES) {
- unlink $file;
- }
- &BuildContents();
- }
-
- print STDERR "done\n" if $VERBOSE;
- exit 0;
-
- ##############
-
- sub parseopts
- {
- while (@ARGV) {
- $_ = shift (@ARGV);
-
- if (/^-v$/) {$VERBOSE = 1;}
- elsif (/^-m$/) {$DO_MAIL = 1;}
- elsif (/^-u$/) {$DO_URL = 1;}
- elsif (/^-s$/) {$SPLIT_ON_CHAPTERS = 1;}
- elsif (/^-d(.*)/) {$DIR = $1 ? $1 : shift(@ARGV);}
- elsif (/^-i(.*)/) {$INDEX_FILE_PREFIX = $1 ? $1 : shift(@ARGV);}
- elsif (/^-p(.*)/) {$CHAPTER_PREFIX = $1 ? $1 : shift(@ARGV);}
- elsif (/^-(.)/) {
- print STDERR "unrecognized option: $_\n";
- &Usage;
- exit (1);
- }
- else {
- print STDERR "unrecognized token: $_\n";
- &Usage();
- exit (1);
- }
- }
- }
-
-
- # build the contents file when splitting on chapters
- sub BuildContents
- {
- print STDERR "building ${INDEX_FILE_PREFIX}.html...\n" if $VERBOSE;
-
- open (CONTENTS,">${INDEX_FILE_PREFIX}.html") ||
- &Fail ("${INDEX_FILE_PREFIX}.html:$!\n");
- print CONTENTS <<"EOF";
- <${html}>
- <${html_head}>
- <${html_title}>${firstchapter} Overview</${html_title}>
- </${html_head}>
- <${html_body}>
- <H2><CENTER>${firstchapter} Overview</CENTER></H2><P>
- <HR>
- <UL>
- EOF
-
- # translate funny chars back to html
- grep (do {&XlateSpecialChars} && 0,%SPLITTED_CHAPTERS);
-
- foreach $key (sort bynumber keys %SPLITTED_CHAPTERS) {
- $file = ${CHAPTER_PREFIX} . ${key};
- print CONTENTS " <LI><A HREF=\"${file}.html\">$SPLITTED_CHAPTERS{$key}</A>\n";
- }
- print CONTENTS <<"EOF";
- </UL>
- <P>
- <HR>
- </${html_body}>
- </${html}>
- EOF
- }
-
-
-
- # do_it; parse input ifile and write to output file ofile
- sub do_it
- {
- local ($ifile,$ofile) = @_;
- local ($i);
- local ($key);
-
- if ($ifile eq "USE SINPUT") {
- open (IFILE, "<&STDIN") || &Fail ("couldn't dup stdin:$!\n");
- }
- else {
- open (IFILE,$ifile) || &Fail ("$file:$!\n");
- }
-
- # fill lines array;
-
- $lines = "";
- $#lines=0;
-
- while (<IFILE>) {
- chop;
- # eat white
- $_=~s/\s+$//go;
- $lines[$.-1]=$_;
- }
- close (IFILE);
-
- if ($ofile eq '') {
- open (OUTFILE, ">&STDOUT") || &Fail ("couldn't dup stdout:$!\n");
- }
- else {
- open (OUTFILE,">${ofile}") || &Fail ("Couldn't create ${ofile}:$!\n");
- }
-
- # translate special characters in
- # their html counterparts
- grep (do {&TranslateSpecialChars} && 0,@lines);
-
- # process subheads & titles
- &HandleSubheads;
-
- print " $title...\n" if $VERBOSE;
-
- # translate funny chars back to html
- grep (do {&TranslateSpecialCharsBack} && 0,@hrefs);
- # build the header
- &MakeHeader;
-
- print OUTFILE "<${html_body}>\n";
- print OUTFILE "<H2><CENTER>$title</CENTER></H2><P>\n";
- print OUTFILE "<HR>";
- print OUTFILE "<UL>\n";
- for ($i = 0; $i <= $#hrefs; $i++) {
- print OUTFILE " <LI>$hrefs[$i]\n";
- }
- print OUTFILE "</UL>\n";
- print OUTFILE "<P>";
- #print OUTFILE "<HR>";
-
- &ParseSetext;
-
- # translate special characters in
- # their html counterparts
- #&TranslateSpecialCharsBack;
- grep (do {&TranslateSpecialCharsBack} && 0,@lines);
-
- for ($i = 0; $i <= $#lines; $i++) {
- $_ = $lines[$i];
- print OUTFILE "$_\n" unless /^\.\. /;
- }
- print OUTFILE "</${html_body}>\n";
- close (OUTFILE);
- }
-
- sub CheckForSetext
- {
- local ($file) = @_;
-
- open (IFILE,$file) || &Fail ("Couldn't open $file:\n$!");
-
- while (<IFILE>) {
- if (/^==*/ || /^--*/) {
- close (IFILE);
- return 1;
- }
- }
- return 0;
- }
-
- sub Fail
- {
- local ($why)=@_;
-
- if (defined ($MACINTOSH))
- {
- &MacPerl'Answer ("$why", "Bye");
- exit;
- }
- else
- {
- die "$why";
- }
- }
-
- # open the input file, if script runs on a Mac
- # ask the user where to save the new file
- # otherwise print open STDOUT
-
- sub MacChooseOuput
- {
- local ($ifile) = @_;
-
- if (defined ($MACINTOSH))
- {
- local ($file,$ofile);
-
- if ($SPLIT_ON_CHAPTERS != 1) {
- $file = $1 if $ifile =~ /([^:]*)$/; # basename
-
- $ofile = &StandardFile'PutFile("Save htmlized version as:", "$file.html");
- exit unless $ofile;
- return $ofile;
- }
- else {
- local ($folder) = &StandardFile'GetFolder("save chapters where?");
- exit unless $folder;
- return $folder;
- }
- }
- }
-
- # translate special characters to a nonprintable char
- # so we can recognize it later.
- # primary use is to *not* screw up HREFS in chapter overviews
-
- sub XlateSpecialChars
- {
- s/&/&/go;
- s/</</go;
- s/>/>/go unless /^>/;
- s/"/"/go;
- s/ö/ö/go;
- s/ñ/ñ/go;
- s/È/È/go;
-
- }
-
- sub TranslateSpecialChars
- {
- s/&/\01/go;
- s/</\02/go;
- s/>/\03/go unless /^>/;
- s/"/\04/go;
- s/ö/\05/go;
- s/ñ/\06/go;
- s/È/\07/go;
-
- }
-
- sub TranslateSpecialCharsBack
- {
- s/\01/&/go;
- s/\02/</go;
- s/\03/>/go;
- s/\04/"/go;
- s/\05/ö/go;
- s/\06/ñ/go;
- s/\07/È/go;
- }
-
-
- sub SplitChapters
- {
- local ($dir) = @_;
- local ($file) = "";
- local ($i,$_);
- local ($firstLineIsChapter);
- local ($currentdir) = `pwd`;
-
- chop ($currentdir);
-
- print STDERR "splitting...\n" if $VERBOSE;
-
- # force a loop when no input files where given
- if (! %FILES) {$FILES{"USE SINPUT"} = "";}
-
- foreach $file (sort keys %FILES) {
- if ($file eq "USE SINPUT") {
- open (IFILE, "&STDIN") || &Fail ("couldn't dup stdin:$!\n");
- }
- else {
- open (IFILE,$file) || &Fail ("$file:$!\n");
- }
- chdir $dir || &Fail ("cd $dir:$!\n");
-
- # fill lines array;
- while (<IFILE>) {
- chop;
- # eat white
- $_=~s/\s+$//go;
- $lines[$.-1]=$_;
- }
- close (IFILE);
- $firstLineIsChapter = 0;
-
- for ($i = 0; $i <= $#lines; $i++) {
- $_ = $lines[$i];
-
- # check for title_tt
- if (/$title_tt/) {
- if (length ($_) == length ($lines[$i-1])) {
- # got a title_tt
-
- $i == 1 && ($firstLineIsChapter = 1);
- (($i > 1) && ($firstLineIsChapter == 0)) &&
- &Fail("$file: when splitting on chapters,\nfirst line in the file should be title_tt\n");
-
- $firstchapter = $lines[$i-1] unless $firstchapter;
- $lines[$i-1] = "\033\036" . $lines[$i-1];
-
- print STDERR " ", substr ($lines[$i-1],2), "\n" if $VERBOSE;
- }
- }
- }
- for ($i = 0; $i <= $#lines; $i++) {
- $_ = $lines[$i];
-
- if (s#^\033\036(.*)#$1#go) {
- close (CHAPTER) if ($CHAPTER_NO++ > 1);
- local ($splittedfile) = "${CHAPTER_PREFIX}${CHAPTER_NO}";
- open (CHAPTER,">${splittedfile}") || &Fail ("${splittedfile}:$!\n");
- $SPLITTED_CHAPTERS{"$CHAPTER_NO"}=$_;
- }
- print CHAPTER $_,"\n";
- }
- close (CHAPTER);
-
- # back to working direcory
- chdir($currentdir);
- }
- # rebuild %FILES array
- %FILES=();
-
- foreach $key (sort bynumber keys %SPLITTED_CHAPTERS) {
- $file = ${CHAPTER_PREFIX} . ${key};
- $FILES{$file} = "${file}.html"
- }
- }
-
-
-
- # handle subheads & titles;
- # the lines are stripped from trailing whitespace,
- # if a line is followed by a line containing a equal
- # number of '=' or '-' starting at position 1,
- # then that line is a valid title_tt or subhead_tt.
- sub HandleSubheads
- {
- local ($issetext)=0;
- local ($y)=0;
- local ($i,$_);
-
- $title="";
- $#hrefs=0;
- $SECTION = 0;
-
- for ($i = 0; $i <= $#lines; $i++) {
- $_ = $lines[$i];
-
- # check for title_tt
- if (/$title_tt/) {
- if (length ($_) == length ($lines[$i-1])) {
- $SECTION++;
- $issetext = 1;
- $title = $lines[$i-1] unless $title ne "";
- $hrefs[$y++]="<A HREF=\"\#sub${SECTION}\">$lines[$i-1]</A>";
- $lines[$i-1]=~s#(.*)# <HR>\n <H2><A NAME=\"sub${SECTION}\">$1</A></H2>#;
- $lines[$i] = '.. ';
- next;
- }
- }
-
- # check for subhead_tt
- elsif (/$subhead_tt/) {
- if (length ($_) == length ($lines[$i-1])) {
- $SECTION++;
- $issetext = 1;
- $hrefs[$y++]="<A HREF=\"\#sub${SECTION}\">$lines[$i-1]</A>";
- $lines[$i-1]=~s#(.*)# <HR>\n <H3><A NAME=\"sub${SECTION}\">$1</A></H3>#;
- $lines[$i] = '.. ';
- next;
- }
- }
- }
-
- # if we don't have a setext file, die!
- &Fail ("not a setext file!\n") unless $issetext == 1;
-
- }
-
- sub MakeHeader
- {
- print OUTFILE "<${html_head}>\n";
- &PrintHtmlStyle ($html_title, $title) unless ($title eq "");
- print OUTFILE "</${html_head}>\n";
- }
-
- sub ParseSetext
- {
- local ($i, $_);
-
- for ($i = 0; $i <= $#lines; $i++) {
- $_ = $lines[$i];
-
- next if (/^\.\. /);
-
- if ($STATE == $INDENT && /^$/) {
- $lines[$i] .= "\n<${html_paragraphbreak}>\n" unless $UNFOLDING++;
- #print OUTFILE "<${html_paragraphbreak}>\n" unless $UNFOLDING++;
- next;
- }
- $UNFOLDING = 0;
-
- # state transitions
- if (/^>\s/) { &to_mono; }
- elsif (/^ [^ ]/) { &to_fmt; }
- else { &to_pre; }
-
- s/^>\s*//; # fix quote-tt
- s/^ ([^ ])/\1/; # fix indent-tt
-
- # quote-tt
- &DoQuote();
-
- # urls
- $DO_URL == 1 && &DoUrl();
-
- # mail adresses
- $DO_MAIL == 1 && &DoMail();
-
-
-
- # bullet_tt
- s#^\*\s+(.*)#<UL><LI>$1</UL>#o;
-
- # bold_tt
- s#\*\*([^\*]*)\*\*#<$html_bold>$1</$html_bold>#go;
-
- # italic_tt
- s#~([^\s]*)~#<$html_italic>$1</$html_italic>#go;
-
- # underline_tt
- s#_([^\s]*)_#($a = $1) =~ s,_, ,g; "\<$html_uline\>$a\<\/$html_uline\>"; #ge;
-
- # hot_tt
- s#\b([^\s]*)_\b#<$html_strongemph>$1</$html_strongemph>#go;
-
- s,\0331,_,go;
- s,\0332,~,go;
- s,\0333,",go;
- s,\0334,*,go;
- $lines[$i] = join ('',$stylechange,$_);
- }
- }
-
- sub DoUrl
- {
- s#(ftp|http|wais|telnet|gopher|news)(://[\w-\+]+)(([\.~][\w-\+/\*:\?_\#]+){0,})#&SubstUrl();#ge;
- }
-
- sub SubstUrl
- {
- local ($a) = join ('',$1,$2,$3);
- $a =~ s,_,\0331,go;
- $a =~ s,~,\0332,go;
- return "<A HREF=\0333$a\0333>$a</A>";
- }
-
- sub DoMail
- {
- s#([\w-\+.!]+[@!][\w-\+]+)((\.[\w-\+]+){0,})#&SubstMail();#ge;
- }
-
- sub SubstMail
- {
- local ($a) = join ('',$1,$2);
- $a =~ s,_,\0331,go;
- $a =~ s,~,\0332,go;
- return "<A HREF=\0333mailto:$a\0333>$a</A>";
- }
-
-
- sub DoQuote
- {
- # quoted typotags should be passed unmodified,
- # change typetags special chars to escape sequences
-
- s#'_(.*)!_'#&SubstQuote();#ge;
- }
-
-
- sub SubstQuote
- {
- local ($a) = $1;
- $a =~ s,_,\0331,go;
- $a =~ s,~,\0332,go;
- $a =~ s,\*,\0334,go;
-
- return "$a";
- }
-
- sub PrintHtmlStyle
- {
- local ($style, $what) = @_;
-
- print OUTFILE "\<$style\>$what\<\/$style\>\n";
- }
-
- sub to_fmt {
- $STATE == $INDENT && do {$stylechange = ""; return};
-
- $stylechange = "</PRE>\n" if $STATE == $PREFORMAT;
- $stylechange = "</PRE></BLOCKQUOTE>\n" if $STATE == $MONO;
-
- $STATE = $INDENT;
- }
- sub to_pre {
- $STATE == $PREFORMAT && do {$stylechange = ""; return};
-
- $stylechange = "<PRE>\n" if $STATE == $INDENT;
- $stylechange = "</PRE></BLOCKQUOTE><PRE>\n" if $STATE == $MONO;
-
- $STATE = $PREFORMAT;
- }
- sub to_mono {
- $STATE == $MONO && do {$stylechange = ""; return};
-
- $stylechange = "<BLOCKQUOTE><PRE>\n" if $STATE == $INDENT;
- $stylechange = "</PRE><BLOCKQUOTE><PRE>\n" if $STATE == $PREFORMAT;
-
- $STATE = $MONO;
- }
-
- sub bynumber {$a <=> $b}
-
- sub Usage
- {
- $0 = $1 if $0 =~ /([^\/]*)$/; # basename
-
- print STDERR <<"EOF";
- $0 usage: $0 [file(s)] [-v] [-m] [-u]
- [-s [-d directory -i indexprefix -p filesprefix]]
- EOF
- }
-