home *** CD-ROM | disk | FTP | other *** search
- /*
- * Splits an HTML file into several files and updates
- * hypertext links accordingly.
- *
- * Restrictions:
- *
- * 1. The ``<A NAME=...>'' and ``<A HREF=...>''
- * anchors MUST be found verbatim, i.e. without
- * excess whitespace and *not* split between 2
- * adjacent lines. If the HTML file was produced
- * by Makeinfo, you should use the @w{} directive
- * judiciously to prevent line-filling mechanism
- * from splitting the anchors between lines.
- * 2. Currently only supports splitting the file one
- * node per file; you cannot split the file by
- * chapters. The string which signals the beginning
- * of a new node is hard-wired into the program and
- * cannot be changed without recompiling.
- *
- *
- * Author: Eli Zaretskii <eliz@is.elta.co.il>
- *
- * Version: 1.1
- *
- * Last updated: 22 June, 1996
- *
- * ----------------------------------------------------------
- *
- * You can do whatever you like with this program, except:
- * (1) preventing other people (including the author) do
- * whatever they like, and (2) removing the author and
- * version info above.
- *
- * ----------------------------------------------------------
- *
- */
-
- #include <stdio.h>
- #include <unistd.h>
- #include <stdlib.h>
- #include <string.h>
- #include <errno.h>
- #include <fcntl.h>
-
- #ifdef __DJGPP__
-
- #include <io.h>
-
- /* Keep our start-up code minimal: disable filename
- globbing, and don't load the environment file. */
- #include <crt0.h>
-
/* Replacement for the start-up globbing hook: performs no filename
   expansion.  ARG is ignored and a null pointer is always returned. */
char **
__crt0_glob_function(char *arg)
{
  (void)arg;
  return (char **)0;
}
/* Replacement for the start-up hook that loads the environment file:
   deliberately does nothing.  APP_NAME is ignored. */
void
__crt0_load_environment_file(char *app_name)
{
  (void)app_name;
}
-
- #else /* not __DJGPP__ */
-
- /* Some Unix boxes don't have function prototypes in the header files.
- -Wall will complain about this, so here are the prototypes: */
-
- void perror (const char *);
- int fprintf(FILE *, const char *, ...);
-
- /* Non-DJGPP libraries might not have these two functions. */
-
- #include <ctype.h>
-
/* Compare at most N characters of S1 and S2, ignoring letter case.
   Comparison stops early at the first difference or at the end of
   the strings.  Returns zero when the compared prefixes are equal,
   otherwise the difference between the first pair of lower-cased
   characters that differ (negative if S1 sorts first).  */
int
strnicmp(const char *s1, const char *s2, size_t n)
{
  for (; n != 0; --n, ++s1, ++s2)
    {
      /* The <ctype.h> functions are only defined for values
         representable as unsigned char (or EOF); passing a plain,
         possibly negative, char is undefined behavior.  */
      int c1 = tolower((unsigned char)*s1);
      int c2 = tolower((unsigned char)*s2);

      if (c1 != c2)
        return c1 - c2;
      if (c1 == 0)      /* both strings ended together */
        break;
    }

  return 0;
}
-
- #include <sys/types.h>
- #include <sys/stat.h>
-
/* Return the size in bytes of the file open on descriptor FD, as
   reported by fstat(), or -1 if fstat() fails.  */
long
filelength(int fd)
{
  struct stat info;
  long size = -1;

  if (fstat(fd, &info) == 0)
    size = info.st_size;

  return size;
}
-
- #endif /* not __DJGPP__ */
-
- #ifndef O_BINARY
- #define O_BINARY 0
- #endif
-
/* Text that, at the start of a line, signals the beginning of a new
   node; the file is split at each such line.  */
static const char split_marker[] = "<P> | <A HREF=\"#";
static size_t split_marker_len = sizeof split_marker - 1;

/* Text that opens a destination anchor; the anchor name follows it.  */
static const char dest_marker[] = "<A NAME=\"";
static size_t dest_marker_len = sizeof dest_marker - 1;

/* Text that opens a link to an anchor inside the same document; the
   anchor name follows the `#'.  */
static const char link_marker[] = "<A HREF=\"#";
static size_t link_marker_len = sizeof link_marker - 1;
-
/* Return non-zero if the text at POINT begins with the first LEN
   characters of STRING, compared without regard to letter case;
   return zero otherwise.  */
int
looking_at(const char string[], size_t len, char *point)
{
  int difference = strnicmp(string, point, len);

  return difference == 0;
}
-
- /* Record a position where we'll split the file, bump point. */
- static int *split_pos_table; /* table of split positions */
- static int split_pos_table_size; /* the size of the table */
- static int split_pos_idx; /* index of next free slot */
-
- size_t
- remember_split_pos(size_t pos)
- {
- if (split_pos_idx >= split_pos_table_size)
- {
- if (split_pos_table)
- split_pos_table =
- (int *)realloc(split_pos_table,
- (split_pos_table_size *= 2)*sizeof(size_t));
- else
- {
- split_pos_table_size = 100;
- split_pos_table = (int *)malloc(split_pos_table_size*sizeof(size_t));
- }
-
- if (split_pos_table == (int *)0)
- {
- errno = ENOMEM;
- perror("split_pos table");
- exit(2);
- }
- }
-
- split_pos_table[split_pos_idx++] = pos;
-
- return split_marker_len;
- }
-
- /* Return the file position where subfile FILENO ends. */
- size_t
- get_split_pos(int fileno)
- {
- return split_pos_table[fileno];
- }
-
- /* Record an anchor name and its subfile number, bump point. */
- struct _dest_pos {
- char *name;
- int fileno;
- };
- static struct _dest_pos *dest_pos_table; /* table of anchors */
- static int dest_pos_table_size; /* the size of the table */
- static int dest_pos_idx; /* index of next free slot */
-
- int
- remember_dest_pos(char *p, int fileno)
- {
- char *save_point = p;
- char *name_start;
-
- if (dest_pos_idx >= dest_pos_table_size)
- {
- if (dest_pos_table)
- dest_pos_table = (struct _dest_pos *)
- realloc(dest_pos_table,
- (dest_pos_table_size *= 2)*sizeof(struct _dest_pos));
- else
- {
- dest_pos_table_size = 100;
- dest_pos_table = (struct _dest_pos *)
- malloc(dest_pos_table_size*sizeof(struct _dest_pos));
- }
-
- if (dest_pos_table == (struct _dest_pos *)0)
- {
- errno = ENOMEM;
- perror("dest_pos table");
- exit(2);
- }
- }
-
- p += dest_marker_len;
- name_start = p;
- while (*p !='"')
- p++;
-
- dest_pos_table[dest_pos_idx].fileno = fileno;
- dest_pos_table[dest_pos_idx].name = (char *)malloc(p - name_start + 1);
- if (dest_pos_table[dest_pos_idx].name == (char *)0)
- {
- errno = ENOMEM;
- perror("name in dest_pos table");
- exit(2);
- }
- strncpy(dest_pos_table[dest_pos_idx].name, name_start, p - name_start);
- dest_pos_table[dest_pos_idx++].name[p - name_start] = '\0';
-
- return p - save_point;
- }
-
- /* Skip ``<A HREF="'', return pointer to beginning of anchor name. */
- char *
- skip_until_anchor_name(char *point)
- {
- return point + link_marker_len;
- }
-
- /* Which subfile is this anchor in? */
- int
- subfile_num_for_anchor_at_point(char *point)
- {
- char c, *name_start = point;
- int idx = 0;
-
- while (*point != '"')
- point++;
-
- for (c = *name_start; idx < dest_pos_idx; idx++)
- {
- register char *anchor = dest_pos_table[idx].name;
-
- if (anchor[0] == c)
- {
- size_t len = strlen(anchor);
-
- /* Be careful not to catch possible substrings! */
- if (len == point - name_start
- && strncmp(anchor, name_start, len) == 0)
-
- return dest_pos_table[idx].fileno;
- }
- }
-
- fprintf(stderr, "%.*s: not found in table of anchors\n",
- (int)(point - name_start), name_start);
- exit(2);
- }
-
- int
- main(int argc, char *argv[])
- {
- if (argc == 3)
- {
- int in_fd = open(argv[1], O_RDONLY | O_BINARY);
- int out_fd;
- long fsize, actual_size;
- char *in_file;
- char *p, *last_p, *from;
- int subfile = 0;
- char subfile_name[FILENAME_MAX];
- int max_digits = 1;
- size_t split_pos;
-
- /* First, read the file. */
-
- if (in_fd < 0)
- {
- perror(argv[1]);
- return 2;
- }
-
- fsize = filelength(in_fd);
-
- in_file = (char *)malloc(fsize + 1); /* leave place for `\0' */
- if (in_file == (char *)0)
- {
- errno = ENOMEM;
- perror(argv[1]);
- return 2;
- }
-
- if ((actual_size = read(in_fd, in_file, fsize)) != fsize)
- {
- if (actual_size <= 0)
- {
- perror(argv[1]);
- return 2;
- }
- fprintf(stderr, "%s: size is %ld, but only %ld bytes read\n",
- argv[1], fsize, actual_size);
- fsize = actual_size;
- }
-
- close (in_fd);
-
- for (p = in_file + fsize - 1; *p == 0x1a && p > in_file; --p)
- {
- fsize--;
- actual_size--;
- }
-
- if (fsize < 2048)
- {
- fprintf(stderr, "%s: too small to bother\n", argv[1]);
- return 3;
- }
-
- p[1] = '\0';
-
- /* Pass 1: Determine the file positions where the file
- will be split, and remember positions of the
- <A NAME="#dest"> destination anchors. */
-
- for (last_p = p, p = in_file; p < last_p; )
- {
- if (*p == '\n' && looking_at(split_marker, split_marker_len, ++p))
- {
- p += remember_split_pos(p - in_file);
- subfile++;
- }
- else if (looking_at(dest_marker, dest_marker_len, p))
- {
- p += remember_dest_pos(p, subfile);
- }
- else
- ++p;
- }
-
- /* Last subfile ends at EOF. */
- remember_split_pos(p - in_file);
- subfile++;
-
- while (subfile /= 10)
- max_digits++;
-
- /* Pass 2: Generate the subfiles with updated links. */
-
- sprintf(subfile_name, "%s.html", argv[2]);
- if ((out_fd = open(subfile_name,
- O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666)) == -1)
- {
- perror(subfile_name);
- return 2;
- }
- split_pos = get_split_pos(subfile);
-
- for (p = in_file, from = p; p < last_p; ++p)
- {
- if (p - in_file >= split_pos) /* time to start another file */
- {
- if (write(out_fd, from, split_pos - (from - in_file)) <= 0)
- {
- perror("write at split position");
- return 2;
- }
- close(out_fd);
- from = in_file + split_pos;
- split_pos = get_split_pos(++subfile);
- sprintf(subfile_name, "%s%.*d.html",
- argv[2], max_digits, subfile);
- if ((out_fd = open(subfile_name,
- O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
- 0666)) == -1)
- {
- perror(subfile_name);
- return 2;
- }
- }
- else if (looking_at(link_marker, link_marker_len, p))
- {
- int which_file;
-
- p = skip_until_anchor_name(p);
- which_file = subfile_num_for_anchor_at_point(p);
-
- --p; /* the `#' character goes AFTER the file */
-
- sprintf(subfile_name, which_file ? "%s%.*d.html" : "%s.html",
- argv[2], max_digits, which_file);
- if (write(out_fd, from, p - from) <= 0 ||
- write(out_fd, subfile_name, strlen(subfile_name)) <= 0)
- {
- perror("write at anchor name");
- return 2;
- }
- from = p;
- }
- }
-
- if (p != from)
- if (write(out_fd, from, p - from) <= 0)
- {
- perror("write at EOF");
- return 2;
- }
-
- fprintf(stderr, "%s was split into %d file%s\n",
- argv[1], subfile + 1, subfile ? "s" : "");
-
- return 0;
- }
- else
- {
- fprintf(stderr, "Usage: %s inputfile outbase\n", *argv);
- return 1;
- }
- }
-