home *** CD-ROM | disk | FTP | other *** search
/ OS/2 Shareware BBS: 5 Edit / 05-Edit.zip / catdo_35.zip / catdoc.c < prev    next >
C/C++ Source or Header  |  1999-12-17  |  11KB  |  373 lines

  1.  
  2. /* .................................................... documentation ...
  3.  *
  4.  * You probably want to #define LATIN1 (commented out), otherwise you
  5.  * get cyrillic code page translations.  I think that's the only
  6.  * user-serviceable part.  The author's original message follows:
  7.  *
  8.  *
  9.  * Usage notes
  10.  *
  11.  * -t switch causes replacing of special symbols such as em-dash by
  12.  *    TeX (LaTeX) commands instead of ASCII printable equivalents
  13.  * -a disables effect of previously specified -t
  14.  *
  15.  * -w disables wordwrap - prints paragraphs as long lines.
  16.  *  
  17.  *
  18.  * -s switch: if program cannot find MS-Word signature before
  19.  * first printable paragraph, it exits with code 1, supposing that it is
  20.  * just plain text which has .doc suffix only by coincidence.
  21.  *
  22.  *
  23.  */
  24.  
  25. /* .................................................... program start ... */
  26.  
  27. /* catdoc.c version 0.3
  28.  *
  29.  * $Id: catdoc.c,v 0.35 1998/06/05 14:07:08 vitus Exp vitus $
  30.  *
  31.  *
  32.  */
  33.  
  34. /* .......................................................... include ... */
  35.  
  36. #include <stdio.h>
  37. #include <stdlib.h>
  38. #include <string.h>
  39.  
  40. /* ........................................................... define ... */
  41.  
/* Line length, in characters, beyond which format() wraps its output. */
#define TEXT_WIDTH 72
/* Size in bytes of the global paragraph buffer buf[] filled by do_file().
 * NOTE(review): this tests the bare `unix` macro; compilers running in a
 * strict ISO mode may define only `__unix__`, in which case the smaller
 * size is selected even on Unix -- confirm against the build flags. */
#ifdef unix
#define BUFFER_SIZE 262144
#else
#define BUFFER_SIZE 16384
#endif
/* enable this define, if you don't want cyrillic code page translations */
/* #define LATIN1*/
  50.  
  51.  
  52. /* ......................................................... charsets ... */
  53.  
  54.  
  55. unsigned char specs[] =
  56. { 7, /* tab columns separator - handled specially*/
  57.       '\n',/* hook to handle end of line in tables */
  58.       0x1E,/* unbreakable defis */
  59.       0x1F,/* soft hyphen */
  60.       0x85,/* dots */
  61.       0x91,/* opening single quote */
  62.       0x92,/* closing single quote */
  63.       0x93,/* opening double quote */
  64.       0x94,/* closing double quote */
  65.       0x96,/* em-dash (or em-space)*/
  66.       0x97,/* en-dash */
  67.       0x99,/* Trade Mark sign */
  68.       0xA0,/* unbreakable space */
  69.       0xA9,/* Copyright sign */
  70.       0xAE,/* Reserved sign */
  71.       0xAB,/* opening << quote*/
  72.       0xBB,/* closing >> quote*/
  73.       '\r',/* Ignore paragraph end in tables*/
  74.       /* The rest is translated into itself unless TeX mode is selected */
  75.       '%','$','_','{','}','\\',
  76.       0 /* To terminate the string, becouse I'm using strchr to search in it*/ 
  77. };
  78.  
  79. unsigned char *ascii_specs[]=
  80. {
  81.     "\t","\n","-","","...","`","'","``","''","-","-","tm",
  82.     " ","(c)","(R)","\"","\""," ","%","$","_","{","}","\\",
  83.     0
  84. };
  85.  
  86.  
  87. unsigned char *TeX_specs[]=
  88. {
  89.     "\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''","---","--",
  90.     "${}^{\\scriptscriptstyle\\mathrm{TM}}$",/* this is my idea about tm sign*/
  91.     "~",
  92.     "{\\copyright}",
  93.     "\\circledchar{R}",/* specific to teTeX */
  94.  
  95.     "<",">",          /* specific to Urbansoft teTeX russification */
  96.     " ",
  97.     "\\%","\\$","\\_","$\\{$","$\\}$","$\\backslash$",
  98.     0
  99. };
  100.  
/*********************************************************************/
/**  code page translation                                          **/
/**                                                                 **/
/**  Unless LATIN1 is defined, recode_char(x) looks byte x up in a  **/
/**  256-entry table; with LATIN1 defined it yields x unchanged.    **/
/**  Which of the two tables is compiled in depends on whether the  **/
/**  `unix` macro is defined.                                       **/
/*********************************************************************/
#ifndef LATIN1
#ifdef unix


/* One output byte per input byte; each row below covers 16 inputs. */
unsigned char table[256]=
{
/* Windows cyrillic code page to KOI-8 */
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE,
0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B,
0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};


#else

/* Table used when `unix` is not defined.  The target code page is not
 * stated in this file -- presumably a DOS/OS/2 cyrillic code page; TODO
 * confirm.
 * NOTE(review): this table has 'i' at index 0xB4, while the unix table
 * above has it at index 0xB3 -- looks like an off-by-one; confirm which
 * position is intended before changing it. */
unsigned char table[256]=
{
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef};

#endif

/* Translate one byte through the table selected above.  The argument is
 * used directly as the array index, so it must be in the range 0..255. */
#define recode_char(x) table[x]

#else
/* LATIN1 build: no code page translation, bytes pass through unchanged. */
#define recode_char(x) x
#endif
  157.  
/* global flag: set to 1 by the -w option in main(); when non-zero,
 * format() prints each paragraph without breaking it at TEXT_WIDTH */
int nowrap=0;
  160. /* ............................................................. func ... */
  161. unsigned char *map_char(unsigned char **map,int c)
  162. {
  163.     static      unsigned char    buffer[2]="a";
  164.                 unsigned char    *ptr;
  165.  
  166.     if ( ( ptr = strchr( specs, c)) )
  167.     {
  168.         return map[ ptr - specs ];
  169.     }
  170.     else
  171.     {
  172.         buffer[0]=recode_char(c);
  173.         return buffer;
  174.     }
  175. }
  176.  
  177.  
  178. /* ............................................................. func ... */
  179. void format( unsigned char *buf, unsigned char **map)
  180. {
  181.     unsigned char    outstring[128];
  182.     unsigned char    *sp = buf, *dp;
  183.     int     table = 0;
  184.  
  185.     outstring[0] = '\0';                        /* clear as "" */
  186.  
  187.     while (*sp)
  188.     {
  189.         if (*sp==7&&table)
  190.         {
  191.             printf("%s%s",outstring,map_char(map,'\n'));
  192.             outstring[0]=0;
  193.             table=0;sp++;
  194.         }
  195.         else
  196.         {   
  197.             if ( strlen( strcat( outstring, map_char( map ,*sp))) > TEXT_WIDTH)
  198.             { 
  199.                if (nowrap) {
  200.                  printf("%s",outstring);
  201.                  *outstring=0;
  202.                } else {
  203.                 dp = strrchr(outstring,' ');
  204.                 if (dp)
  205.                 {
  206.                     *(dp++)=0;
  207.                     printf("%s\n",outstring);
  208.                     strcpy(outstring,dp);
  209.                 }
  210.                 else
  211.                 {
  212.                     int i;
  213.                     for(i=0;i<TEXT_WIDTH;i++) putc(outstring[i],stdout);
  214.                     putc('\n',stdout);
  215.                     strcpy(outstring,outstring+72);
  216.                 }
  217.               }
  218.             }
  219.             table=*(sp++)==7;
  220.         }
  221.     }
  222.     if (nowrap) {
  223.        if (outstring[0]!=0) {
  224.           printf("%s\n", outstring);
  225.        }
  226.     } else {
  227.     if (outstring[0]==0)
  228.         putc('\n',stdout);
  229.     else
  230.         printf("%s\n\n",outstring);
  231.     }
  232. }
  233.  
  234. /* ............................................................. func ... */
  235. void help(void)
  236. {
  237.     printf(
  238.     "catdoc - exctract text from MS-Word files and catenate it to stdout\n"
  239.     "Copyright (c) by Victor B. Wagner, 1996\n"
  240.     "Usage catdoc [-ast] files ...\n"
  241.     "\t-a - converts non-standard printable chars into readable form (default)\n"
  242.     "\t-t - converts them into TeX control sequences\n"
  243.     "\t-w - disables word wrapping\n"
  244.     "\t-s - exits with code 1 if MSWordDoc signature not found before\n"
  245.     "\t\tfirst printable paragraph\n\n"
  246.     "All options affects only files, specified AFTER them\n"
  247.            );
  248.     exit(2);
  249. }
  250.  
  251. /* ............................................................. func ... */
  252.  
  253. unsigned char buf[BUFFER_SIZE];
  254.  
  255. void do_file(FILE *f, unsigned char **map, int search_sign)
  256. {
  257.     int ok =! search_sign;
  258.     int bufptr, c;
  259.  
  260.     while( !feof(f) )
  261.     {
  262.         bufptr = -1;
  263.  
  264.         do {
  265.  
  266.             c = getc(f);
  267.  
  268.             /* Special printable symbols 7- table separator
  269.              *
  270.              * \r   - paragraph end
  271.              * 0x1E - short defis
  272.              *
  273.              */
  274.  
  275.             if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E)
  276.                 buf[++bufptr]=c;
  277.             else
  278.                 if (c==0x0b)
  279.                     buf[++bufptr]='\r';
  280.                 else
  281.                 {
  282.                     if (!c)
  283.                     {
  284.                         buf[++bufptr]=0;
  285.                         if(!strcmp(buf,"MSWordDoc"))
  286.                         {
  287.                             ok=1;
  288.                         }
  289.                     }
  290.                     if (c!=2)/* \002 is Word's footnote mark */
  291.                         bufptr=-1; /*all other special symbols
  292.                                      discard buffe */
  293.                 }
  294.         }
  295.  
  296.         while (c!='\r'&&c!=EOF);
  297.         if (c==EOF && !ok) exit(1);
  298.         if (bufptr>0&&buf[bufptr]=='\r')
  299.         {
  300.             if (!ok)
  301.                 exit( 1);
  302.             buf[bufptr]=0;
  303.             format(buf,map);
  304.         }
  305.     }
  306. }
  307.  
  308. /* ............................................................. func ... */
  309.  
  310. int main(int argc,char **argv)
  311. {
  312.     /* search_sign:
  313.      *     Must program exit with exit code 1 if MSWordDoc
  314.      *     signature is not found?
  315.      *
  316.      * sequences:
  317.      *     pointer to array of character sequences
  318.      *     to represent special characters of Word
  319.      */
  320.  
  321.  
  322.     int     search_sign = 0;
  323.     unsigned char    **sequences = ascii_specs;
  324.     int     i= 1,
  325.             stdin_processed=0;
  326.  
  327.     if (argc<2)
  328.     {
  329.         help();
  330.     }
  331.  
  332.     for(;i<argc;i++)
  333.     {
  334.         if (!strcmp(argv[i],"-s"))
  335.             search_sign=1;
  336.         else if (!strcmp(argv[i],"-t"))
  337.             sequences=TeX_specs;
  338.         else if (!strcmp(argv[i],"-a"))
  339.             sequences=ascii_specs;
  340.         else if (!strcmp(argv[i],"-w"))
  341.             nowrap=1;
  342.         else if (!strcmp(argv[i],"-"))
  343.             if (!stdin_processed)
  344.             {
  345.                 do_file(stdin,sequences,search_sign);
  346.                 stdin_processed=1;
  347.             }
  348.             else
  349.             {
  350.                 fprintf(stderr,"Cannot process standard input twice a row\n");
  351.                 exit (2);
  352.             }
  353.         else if (argv[i][0]=='-')
  354.         {
  355.             fprintf(stderr,"Invalid option %s\n",argv[i]);
  356.             help();
  357.         }
  358.         else
  359.         {
  360.             FILE *f=fopen(argv[i],"rb");
  361.             if(!f)
  362.             {
  363.                 fprintf(stderr,"Cannot open file %s\n",argv[i]);
  364.                 exit(2);
  365.             }
  366.             do_file(f,sequences,search_sign);
  367.         }
  368.     }
  369.     return 0;
  370. }
  371.  
  372. /* end of file */
  373.