home *** CD-ROM | disk | FTP | other *** search
/ OS/2 Shareware BBS: 5 Edit / 05-Edit.zip / catdo_35.zip / catdoc0_35s.c < prev    next >
C/C++ Source or Header  |  1999-12-17  |  15KB  |  438 lines

  1. /* .................................................... documentation ...
  2.  *
  3.  * You probably want to #define LATIN1 (commented out), otherwise you
  4.  * get cyrillic code page translations.  I think that's the only
  5.  * user-serviceable part.  The author's original message follows:
  6.  *
  7.  *
  8.  * Usage notes
  9.  *
  10.  * -t switch causes replacing of special symbols such as em-dash by
  11.  *    TeX (LaTeX) commands instead of ASCII printable equivalents
  12.  * -a disables effect of previously specified -t
  13.  *
  14.  * -w disables wordwrap - prints paragraphs as long lines.
  15.  *  
  16.  *
  17.  * -s switch: if program cannot find MS-Word signature before
  18.  * first printable paragraph, it exits with code 1, supposing that it is
  19.  * just plain text which has .doc suffix only by coincidence.
  20.  *
  21.  * -------------------------------------------------------------------
  22.  *
  23.  * This file has been modified by Stefan Schwarzer <s.schwarzer@ndh.net>
  24.  * to allow for successive conversion from latin1 to cp437 or cp850.
  25.  * This is accomplished by using one (more) table translation if options
  26.  * -4 (convert to cp437) or -8 (convert to cp850) are selected. Default
  27.  * is to use none of these translations.
  28.  *
  29.  * The tables were generated by recode latin1:cp437 and recode latin1:cp850,
  30.  * respectively, and a small python script.
  31.  */
  32.  
  33. /* .................................................... program start ... */
  34.  
  35. /* catdoc.c version 0.3
  36.  *
  37.  * $Id: catdoc.c,v 0.35 1998/06/05 14:07:08 vitus Exp vitus $
  38.  *
  39.  *
  40.  */
  41.  
  42. /* .......................................................... include ... */
  43.  
  44. #include <stdio.h>
  45. #include <stdlib.h>
  46. #include <string.h>
  47.  
  48. /* ........................................................... define ... */
  49.  
  50. #define TEXT_WIDTH 72
  51. #if (defined unix) || (defined OS2)
  52. #define BUFFER_SIZE 262144
  53. #else
  54. #define BUFFER_SIZE 16384
  55. #endif
  56. /* enable this define, if you don't want cyrillic code page translations */
  57. #define LATIN1
  58.  
  59.  
  60. /* ......................................................... charsets ... */
  61.  
  62. unsigned char specs[] =
  63. { 7, /* tab columns separator - handled specially*/
  64.       '\n',/* hook to handle end of line in tables */
  65.       0x1E,/* unbreakable defis */
  66.       0x1F,/* soft hyphen */
  67.       0x85,/* dots */
  68.       0x91,/* opening single quote */
  69.       0x92,/* closing single quote */
  70.       0x84,/* opening double quote */
  71. /*    0x93,/* opening double quote */
  72.       0x93,/* closing double quote */
  73. /*    0x94,/* closing double quote */
  74.       0x96,/* em-dash (or em-space)*/
  75.       0x97,/* en-dash */
  76.       0x99,/* Trade Mark sign */
  77.       0xA0,/* unbreakable space */
  78.       0xA9,/* Copyright sign */
  79.       0xAE,/* Reserved sign */
  80.       0xAB,/* opening << quote*/
  81.       0xBB,/* closing >> quote*/
  82.       '\r',/* Ignore paragraph end in tables*/
  83.       /* The rest is translated into itself unless TeX mode is selected */
  84.       '%','$','_','{','}','\\',
  85.       0 /* To terminate the string, because I'm using strchr to search in it*/ 
  86. };
  87.  
  88. unsigned char *ascii_specs[]=
  89. {
  90.     "\t","\n","-","","...","`","'","``","''","-","-","tm",
  91.     " ","(c)","(R)","\"","\""," ","%","$","_","{","}","\\",
  92.     0
  93. };
  94.  
  95.  
  96. unsigned char *TeX_specs[]=
  97. {
  98.     "\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''","---","--",
  99.     "${}^{\\scriptscriptstyle\\mathrm{TM}}$",/* this is my idea about tm sign*/
  100.     "~",
  101.     "{\\copyright}",
  102.     "\\circledchar{R}",/* specific to teTeX */
  103.  
  104.     "<",">",          /* specific to Urbansoft teTeX russification */
  105.     " ",
  106.     "\\%","\\$","\\_","$\\{$","$\\}$","$\\backslash$",
  107.     0
  108. };
  109.  
  110. /*********************************************************************/
  111. /**  code_page translation                                          **/
  112. /*********************************************************************/
  113. #ifndef LATIN1
  114.  
  115. #ifdef unix
  116.  
  117. unsigned char table[256]=
  118. {
  119. /* Windows cyrillic code page to KOI-8 */
  120. 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F,
  121. 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20,
  122. 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
  123. 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
  124. 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
  125. 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
  126. 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
  127. 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
  128. 0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
  129. 0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
  130. 0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE,
  131. 0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B,
  132. 0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
  133. 0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
  134. 0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
  135. 0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};
  136.  
  137. #else
  138.  
  139. unsigned char table[256]=
  140. {
  141. 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f,
  142. 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20,
  143. 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
  144. 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
  145. 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
  146. 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
  147. 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
  148. 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
  149. 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
  150. 0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
  151. 0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf,
  152. 0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf,
  153. 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
  154. 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
  155. 0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
  156. 0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef};
  157.  
  158. #endif
  159.  
  160. #define recode_char(x) table[x]
  161.  
  162. #else
  163.  
  164. /* LATIN1 defined */
  165. #define recode_char(x) x
  166.  
  167. #endif
  168.  
  169. /* We need only ASCII codes >= 128, the characters below 128 are the same for
  170.  * latin1, cp437, and cp850.
  171.  */
  172. unsigned char table_latin1_to_cp437[128]=
  173. {
  174. 0xc7,0xfc,0xe9,0xe2,0xe4,0xe0,0xe5,0xe7,0xea,0xeb,0xe8,0xef,0xee,0xec,0xc4,
  175. 0xc5,0xc9,0xe6,0xc6,0xf4,0xf7,0xf2,0xfb,0xb7,0xe1,0xd6,0xdc,0xf3,0xfa,0xd1,
  176. 0x9e,0x9f,0xff,0xad,0x9b,0x9c,0xb1,0x9d,0xbc,0xba,0xbf,0xa9,0xa6,0xae,0xaa,
  177. 0xed,0xbd,0xbb,0xb0,0xf1,0xfd,0xb3,0xb4,0xb5,0xb6,0xf9,0xb8,0xb9,0xa7,0xaf,
  178. 0xac,0xab,0xbe,0xa8,0xc0,0xc1,0xc2,0xc3,0x8e,0x8f,0x92,0x80,0xc8,0x90,0xca,
  179. 0xcb,0xcc,0xcd,0xce,0xcf,0xd0,0xa5,0xd2,0xd3,0xd4,0xd5,0x99,0xd7,0xd8,0xd9,
  180. 0xda,0xdb,0x9a,0xdd,0xde,0xdf,0x85,0xa0,0x83,0xe3,0x84,0x86,0x91,0x87,0x8a,
  181. 0x82,0x88,0x89,0x8d,0xa1,0x8c,0x8b,0xf0,0xa4,0x95,0xa2,0x93,0xf5,0x94,0xf6,
  182. 0xf8,0x97,0xa3,0x96,0x81,0xb2,0xfe,0x98
  183. };
  184.  
  185. unsigned char table_latin1_to_cp850[128]=
  186. {
  187. 0xc3,0xb3,0xda,0xc8,0xba,0xcb,0xd5,0xcc,0xdb,0xd9,0xfe,0xb4,0xee,0xb2,0xc4,
  188. 0xc5,0xc9,0xe6,0xca,0xc2,0xf7,0xf2,0xb9,0xbf,0xdf,0xcd,0xdc,0xb0,0xfa,0xb1,
  189. 0xce,0x9f,0xff,0xad,0xbd,0x9c,0xcf,0xbe,0xdd,0xf5,0xf9,0xb8,0xa6,0xae,0xaa,
  190. 0xf0,0xa9,0xbb,0xf8,0xf1,0xfd,0xfc,0xef,0xc1,0xf4,0xc0,0xbc,0xfb,0xa7,0xaf,
  191. 0xac,0xab,0xf3,0xa8,0xb7,0xb5,0xb6,0xc7,0x8e,0x8f,0x92,0x80,0xd4,0x90,0xd2,
  192. 0xd3,0xde,0xd6,0xd7,0xd8,0xd1,0xa5,0xe3,0xe0,0xe2,0xe5,0x99,0x9e,0x9d,0xeb,
  193. 0xe9,0xea,0x9a,0xed,0xe7,0xe1,0x85,0xa0,0x83,0xc6,0x84,0x86,0x91,0x87,0x8a,
  194. 0x82,0x88,0x89,0x8d,0xa1,0x8c,0x8b,0xd0,0xa4,0x95,0xa2,0x93,0xe4,0x94,0xf6,
  195. 0x9b,0x97,0xa3,0x96,0x81,0xec,0xe8,0x98
  196. };
  197.  
  198. unsigned char* translation_table=NULL;  /* default: no more translation */
  199.  
  200. unsigned char recode_char2(unsigned char c)
  201. {
  202.     if (translation_table == NULL       /* no translation requested */
  203.         || (int)c<128)                  /* common ASCII set */
  204.     {
  205.         return c;                       /* nothing to do */
  206.     }
  207.  
  208.     /* get mapped code from appropriate set translation table */
  209.     return translation_table[(int)c-128];
  210. }
  211.  
  212. /* global flag ---- */
  213. int nowrap=0;
  214. /* ............................................................. func ... */
  215. unsigned char *map_char(unsigned char **map,int c)
  216. {
  217.     static      unsigned char    buffer[2]="a";
  218.                 unsigned char    *ptr;
  219.  
  220.     if ( ( ptr = strchr( specs, c)) )
  221.     {
  222.         return map[ ptr - specs ];
  223.     }
  224.     else
  225.     {
  226.         buffer[0]=recode_char(c);
  227.         buffer[0]=recode_char2(buffer[0]);
  228.         return buffer;
  229.     }
  230. }
  231.  
  232.  
  233. /* ............................................................. func ... */
  234. void format( unsigned char *buf, unsigned char **map)
  235. {
  236.     unsigned char    outstring[128];
  237.     unsigned char    *sp = buf, *dp;
  238.     int     table = 0;
  239.  
  240.     outstring[0] = '\0';                        /* clear as "" */
  241.  
  242.     while (*sp)
  243.     {
  244.         if (*sp==7&&table)
  245.         {
  246.             printf("%s%s",outstring,map_char(map,'\n'));
  247.             outstring[0]=0;
  248.             table=0;sp++;
  249.         }
  250.         else
  251.         {   
  252.             if ( strlen( strcat( outstring, map_char( map ,*sp))) > TEXT_WIDTH)
  253.             { 
  254.                if (nowrap) {
  255.                  printf("%s",outstring);
  256.                  *outstring=0;
  257.                } else {
  258.                 dp = strrchr(outstring,' ');
  259.                 if (dp)
  260.                 {
  261.                     *(dp++)=0;
  262.                     printf("%s\n",outstring);
  263.                     strcpy(outstring,dp);
  264.                 }
  265.                 else
  266.                 {
  267.                     int i;
  268.                     for(i=0;i<TEXT_WIDTH;i++) putc(outstring[i],stdout);
  269.                     putc('\n',stdout);
  270.                     strcpy(outstring,outstring+72);
  271.                 }
  272.               }
  273.             }
  274.             table=*(sp++)==7;
  275.         }
  276.     }
  277.     if (nowrap) {
  278.        if (outstring[0]!=0) {
  279.           printf("%s\n", outstring);
  280.        }
  281.     } else {
  282.     if (outstring[0]==0)
  283.         putc('\n',stdout);
  284.     else
  285.         printf("%s\n\n",outstring);
  286.     }
  287. }
  288.  
  289. /* ............................................................. func ... */
  290. void help(void)
  291. {
  292.     printf(
  293.     "catdoc - exctract text from MS-Word files and concate it to stdout\n"
  294.     "Copyright (c) by Victor B. Wagner, 1996\n"
  295.     "Modified by Stefan Schwarzer, 1999\n"
  296.     "Usage catdoc [-ast48] files ...\n"
  297.     "\t-a - converts non-standard printable chars into readable form (default)\n"
  298.     "\t-t - converts them into TeX control sequences\n"
  299.     "\t-4 - converts latin1 output to cp437\n"
  300.     "\t-8 - converts latin1 output to cp850\n"
  301.     "\t-w - disables word wrapping\n"
  302.     "\t-s - exits with code 1 if MSWordDoc signature not found before\n"
  303.     "\t\tfirst printable paragraph\n\n"
  304.     "All options affects only files, specified AFTER them\n"
  305.            );
  306.     exit(2);
  307. }
  308.  
  309. /* ............................................................. func ... */
  310.  
  311. unsigned char buf[BUFFER_SIZE];
  312.  
  313. void do_file(FILE *f, unsigned char **map, int search_sign)
  314. {
  315.     int ok =! search_sign;
  316.     int bufptr, c;
  317.  
  318.     while( !feof(f) )
  319.     {
  320.         bufptr = -1;
  321.  
  322.         do {
  323.  
  324.             c = getc(f);
  325.  
  326.             /* Special printable symbols 7- table separator
  327.              *
  328.              * \r   - paragraph end
  329.              * 0x1E - short defis
  330.              *
  331.              */
  332.  
  333.             if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E)
  334.                 buf[++bufptr]=c;
  335.             else
  336.                 if (c==0x0b)
  337.                     buf[++bufptr]='\r';
  338.                 else
  339.                 {
  340.                     if (!c)
  341.                     {
  342.                         buf[++bufptr]=0;
  343.                         if(!strcmp(buf,"MSWordDoc"))
  344.                         {
  345.                             ok=1;
  346.                         }
  347.                     }
  348.                     if (c!=2)/* \002 is Word's footnote mark */
  349.                         bufptr=-1; /*all other special symbols
  350.                                      discard buffer */
  351.                 }
  352.         }
  353.  
  354.         while (c!='\r'&&c!=EOF);
  355.         if (c==EOF && !ok) exit(1);
  356.         if (bufptr>0&&buf[bufptr]=='\r')
  357.         {
  358.             if (!ok)
  359.                 exit( 1);
  360.             buf[bufptr]=0;
  361.             format(buf,map);
  362.         }
  363.     }
  364. }
  365.  
  366. /* ............................................................. func ... */
  367.  
  368. int main(int argc,char **argv)
  369. {
  370.     /* search_sign:
  371.      *     Must program exit with exit code 1 if MSWordDoc
  372.      *     signature is not found?
  373.      *
  374.      * sequences:
  375.      *     pointer to array of character sequences
  376.      *     to represent special characters of Word
  377.      */
  378.  
  379.  
  380.     int     search_sign = 0;
  381.     unsigned char    **sequences = ascii_specs;
  382.     int     i= 1,
  383.             stdin_processed=0;
  384.  
  385.     /* state variables for conversions latin1 -> cp437 and latin1 -> cp850 */
  386.  
  387.  
  388.     if (argc<2)
  389.     {
  390.         help();
  391.     }
  392.  
  393.     for(;i<argc;i++)
  394.     {
  395.         if (!strcmp(argv[i],"-s"))
  396.             search_sign=1;
  397.         else if (!strcmp(argv[i],"-t"))
  398.             sequences=TeX_specs;
  399.         else if (!strcmp(argv[i],"-4"))
  400.             translation_table=table_latin1_to_cp437;
  401.         else if (!strcmp(argv[i],"-8"))
  402.             translation_table=table_latin1_to_cp850;
  403.         else if (!strcmp(argv[i],"-a"))
  404.             sequences=ascii_specs;
  405.         else if (!strcmp(argv[i],"-w"))
  406.             nowrap=1;
  407.         else if (!strcmp(argv[i],"-"))
  408.             if (!stdin_processed)
  409.             {
  410.                 do_file(stdin,sequences,search_sign);
  411.                 stdin_processed=1;
  412.             }
  413.             else
  414.             {
  415.                 fprintf(stderr,"Cannot process standard input twice a row\n");
  416.                 exit (2);
  417.             }
  418.         else if (argv[i][0]=='-')
  419.         {
  420.             fprintf(stderr,"Invalid option %s\n",argv[i]);
  421.             help();
  422.         }
  423.         else
  424.         {
  425.             FILE *f=fopen(argv[i],"rb");
  426.             if(!f)
  427.             {
  428.                 fprintf(stderr,"Cannot open file %s\n",argv[i]);
  429.                 exit(2);
  430.             }
  431.             do_file(f,sequences,search_sign);
  432.         }
  433.     }
  434.     return 0;
  435. }
  436.  
  437. /* end of file */
  438.