home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 5 Edit
/
05-Edit.zip
/
catdo_35.zip
/
catdoc0_35s.c
< prev
next >
Wrap
C/C++ Source or Header
|
1999-12-17
|
15KB
|
438 lines
/* .................................................... documentation ...
*
* LATIN1 is #defined further down in this file; remove that #define if
* you want the cyrillic code page translations instead. I think that's
* the only user-serviceable part. The author's original message follows:
*
*
* Usage notes
*
* -t switch causes replacing of special symbols such as em-dash by
* TeX (LaTeX) commands instead of ASCII printable equivalents
* -a disables the effect of a previously specified -t
*
* -w disables wordwrap - prints paragraphs as long lines.
*
*
* -s switch: if the program cannot find the MS-Word signature before the
* first printable paragraph, it exits with code 1, supposing that it is
* just plain text which has a .doc suffix only by coincidence.
*
* -------------------------------------------------------------------
*
* This file has been modified by Stefan Schwarzer <s.schwarzer@ndh.net>
* to allow for successive conversion from latin1 to cp437 or cp850.
* This is accomplished by using one (more) table translation if options
* -4 (convert to cp437) or -8 (convert to cp850) are selected. Default
* is to use none of these translations.
*
* The tables were generated by recode latin1:cp437 and recode latin1:cp850,
* respectively, and a small python script.
*/
/* .................................................... program start ... */
/* catdoc.c version 0.3
*
* $Id: catdoc.c,v 0.35 1998/06/05 14:07:08 vitus Exp vitus $
*
*
*/
/* .......................................................... include ... */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ........................................................... define ... */
#define TEXT_WIDTH 72
#if (defined unix) || (defined OS2)
#define BUFFER_SIZE 262144
#else
#define BUFFER_SIZE 16384
#endif
/* enable this define, if you don't want cyrillic code page translations */
#define LATIN1
/* ......................................................... charsets ... */
/*
 * Input character codes that get a replacement string on output.
 * map_char() looks a code up here with strchr() and uses its position as
 * the index into the replacement table it was given, so the order of the
 * entries must match ascii_specs[] and TeX_specs[].  The trailing 0 is
 * the string terminator strchr() relies on.
 */
unsigned char specs[] = {
    7,      /*  0: table column separator - handled specially */
    '\n',   /*  1: hook used to emit the end of a table row */
    0x1E,   /*  2: non-breaking hyphen */
    0x1F,   /*  3: soft hyphen */
    0x85,   /*  4: ellipsis (dots) */
    0x91,   /*  5: opening single quote */
    0x92,   /*  6: closing single quote */
    0x84,   /*  7: opening double quote (0x93 is the disabled alternative) */
    0x93,   /*  8: closing double quote (0x94 is the disabled alternative) */
    0x96,   /*  9: dash - the en dash in Windows-1252 */
    0x97,   /* 10: dash - the em dash in Windows-1252 */
    0x99,   /* 11: trade mark sign */
    0xA0,   /* 12: non-breaking space */
    0xA9,   /* 13: copyright sign */
    0xAE,   /* 14: registered sign */
    0xAB,   /* 15: opening << quote */
    0xBB,   /* 16: closing >> quote */
    '\r',   /* 17: paragraph end inside a table is ignored */
    /* 18-23: translated into themselves unless TeX mode is selected */
    '%',
    '$',
    '_',
    '{',
    '}',
    '\\',
    0       /* terminator for strchr() */
};
/*
 * Plain-ASCII replacement strings, one per entry of specs[] and in the
 * same order.  The list is closed by a null pointer.
 */
unsigned char *ascii_specs[] = {
    "\t",       /* 7    */
    "\n",       /* '\n' */
    "-",        /* 0x1E */
    "",         /* 0x1F - produces no output */
    "...",      /* 0x85 */
    "`",        /* 0x91 */
    "'",        /* 0x92 */
    "``",       /* 0x84 */
    "''",       /* 0x93 */
    "-",        /* 0x96 */
    "-",        /* 0x97 */
    "tm",       /* 0x99 */
    " ",        /* 0xA0 */
    "(c)",      /* 0xA9 */
    "(R)",      /* 0xAE */
    "\"",       /* 0xAB */
    "\"",       /* 0xBB */
    " ",        /* '\r' */
    "%",
    "$",
    "_",
    "{",
    "}",
    "\\",
    0
};
/*
 * TeX/LaTeX replacement strings, one per entry of specs[] and in the same
 * order.  The list is closed by a null pointer.
 *
 * Entries 9 and 10 belong to codes 0x96 and 0x97.  In the Windows code
 * pages 0x96 is the en dash and 0x97 the em dash, so they are rendered as
 * "--" and "---" respectively.
 */
unsigned char *TeX_specs[]=
{
"\t&","\\\\\n","-","\\-","\\dots{}","`","'","``","''",
"--",   /* 0x96: en dash */
"---",  /* 0x97: em dash */
"${}^{\\scriptscriptstyle\\mathrm{TM}}$",/* this is my idea about tm sign*/
"~",
"{\\copyright}",
"\\circledchar{R}",/* specific to teTeX */
"<",">", /* specific to Urbansoft teTeX russification */
" ",
"\\%","\\$","\\_","$\\{$","$\\}$","$\\backslash$",
0
};
/*********************************************************************/
/** code_page translation **/
/*********************************************************************/
#ifndef LATIN1
/*
 * Cyrillic builds only (LATIN1 not defined): table[] holds the output code
 * for each of the 256 possible input bytes and recode_char() looks a byte
 * up in it.  Codes 0x00-0x7F map to themselves in both variants except
 * 0x0B (becomes 0x0D), 0x1E (becomes '-', 0x2D) and 0x1F (becomes a
 * space); 0xA0 becomes a space as well.
 */
#ifdef unix
unsigned char table[256]=
{
/* Windows cyrillic code page to KOI-8 */
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0D,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
0x80,0x81,0x82,0xAA,0x8F,0x90,0xA9,0x93,0x84,0x92,0x91,0x94,0x83,0x95,0x99,0x8B,
0x98,0x60,0x27,0x22,0x22,0x9A,0x2D,0x2D,0x9E,0xA6,0x87,0xB0,0x8D,0x97,0x86,0xA2,
0x20,0xA7,0xA5,0x88,0xA4,0x8E,0x96,0x85,0xB3,0xA1,0x9F,0x22,0xAB,0xAC,0xAD,0xAE,
0xAF,0xB2,0xB1,'i',0xB5,0xB6,0xB7,0xB8,0xA3,0xB9,0xBA,0x22,0xBC,0xBD,0xBE,0x9B,
0xE1,0xE2,0xF7,0xE7,0xE4,0xE5,0xF6,0xFA,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xF0,
0xF2,0xF3,0xF4,0xF5,0xE6,0xE8,0xE3,0xFE,0xFB,0xFD,0xFF,0xF9,0xF8,0xFC,0xE0,0xF1,
0xC1,0xC2,0xD7,0xC7,0xC4,0xC5,0xD6,0xDA,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xD0,
0xD2,0xD3,0xD4,0xD5,0xC6,0xC8,0xC3,0xDE,0xDB,0xDD,0xDF,0xD9,0xD8,0xDC,0xC0,0xD1};
#else
/*
 * Variant for every other platform.  The original carries no label; input
 * rows 0xC0-0xEF are moved down to 0x80-0xAF and row 0xF0 to 0xE0, which
 * looks like a DOS cyrillic code page (cp866?) - TODO confirm.
 * NOTE(review): the unix table above puts 'i' at index 0xB3, this one at
 * index 0xB4 - confirm which position is intended.
 */
unsigned char table[256]=
{
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0D,0x0c,0x0d,0x0e,0x0f,
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x2D,0x20,
0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x60,0x27,0x22,0x22,0x95,0x2D,0x2D,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0x20,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0x22,0xac,0xad,0xae,0xaf,
0xb0,0xb1,0xb2,0xb3,'i',0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0x22,0xbc,0xbd,0xbe,0xbf,
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef};
#endif
/* Translate one byte through table[]; x is used directly as the index. */
#define recode_char(x) table[x]
#else
/* LATIN1 defined */
/* No first-stage translation: the byte is passed through unchanged.
 * NOTE(review): the expansion is not parenthesised, so only pass simple
 * expressions as the argument. */
#define recode_char(x) x
#endif
/* We need only ASCII codes >= 128, the characters below 128 are the same for
* latin1, cp437, and cp850.
*/
/*
 * latin1 -> cp437, for latin1 codes 0x80..0xFF (index = code - 0x80).
 * One row per 16 codes; the row comment gives the latin1 code of the
 * first entry.
 *
 * The generated table sent three characters that do exist in cp437 to
 * box-drawing codes.  They now use the cp437 code points:
 *   0xB0 degree sign -> 0xF8   (was 0xB0)
 *   0xB5 micro sign  -> 0xE6   (was 0xB5)
 *   0xDF sharp s     -> 0xE1   (was 0xDF)
 * All other entries are unchanged.
 */
unsigned char table_latin1_to_cp437[128]=
{
/* 0x80 */ 0xc7,0xfc,0xe9,0xe2,0xe4,0xe0,0xe5,0xe7,0xea,0xeb,0xe8,0xef,0xee,0xec,0xc4,0xc5,
/* 0x90 */ 0xc9,0xe6,0xc6,0xf4,0xf7,0xf2,0xfb,0xb7,0xe1,0xd6,0xdc,0xf3,0xfa,0xd1,0x9e,0x9f,
/* 0xA0 */ 0xff,0xad,0x9b,0x9c,0xb1,0x9d,0xbc,0xba,0xbf,0xa9,0xa6,0xae,0xaa,0xed,0xbd,0xbb,
/* 0xB0 */ 0xf8,0xf1,0xfd,0xb3,0xb4,0xe6,0xb6,0xf9,0xb8,0xb9,0xa7,0xaf,0xac,0xab,0xbe,0xa8,
/* 0xC0 */ 0xc0,0xc1,0xc2,0xc3,0x8e,0x8f,0x92,0x80,0xc8,0x90,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
/* 0xD0 */ 0xd0,0xa5,0xd2,0xd3,0xd4,0xd5,0x99,0xd7,0xd8,0xd9,0xda,0xdb,0x9a,0xdd,0xde,0xe1,
/* 0xE0 */ 0x85,0xa0,0x83,0xe3,0x84,0x86,0x91,0x87,0x8a,0x82,0x88,0x89,0x8d,0xa1,0x8c,0x8b,
/* 0xF0 */ 0xf0,0xa4,0x95,0xa2,0x93,0xf5,0x94,0xf6,0xf8,0x97,0xa3,0x96,0x81,0xb2,0xfe,0x98
};
/*
 * latin1 -> cp850, for latin1 codes 0x80..0xFF (index = code - 0x80).
 * One row per 16 codes; the row comment gives the latin1 code of the
 * first entry.
 *
 * The generated table sent four characters that do exist in cp850 to
 * box-drawing codes.  They now use the cp850 code points:
 *   0xAF macron      -> 0xEE   (was 0xBB)
 *   0xB5 micro sign  -> 0xE6   (was 0xC1)
 *   0xB7 middle dot  -> 0xFA   (was 0xC0)
 *   0xB8 cedilla     -> 0xF7   (was 0xBC)
 * All other entries are unchanged.
 */
unsigned char table_latin1_to_cp850[128]=
{
/* 0x80 */ 0xc3,0xb3,0xda,0xc8,0xba,0xcb,0xd5,0xcc,0xdb,0xd9,0xfe,0xb4,0xee,0xb2,0xc4,0xc5,
/* 0x90 */ 0xc9,0xe6,0xca,0xc2,0xf7,0xf2,0xb9,0xbf,0xdf,0xcd,0xdc,0xb0,0xfa,0xb1,0xce,0x9f,
/* 0xA0 */ 0xff,0xad,0xbd,0x9c,0xcf,0xbe,0xdd,0xf5,0xf9,0xb8,0xa6,0xae,0xaa,0xf0,0xa9,0xee,
/* 0xB0 */ 0xf8,0xf1,0xfd,0xfc,0xef,0xe6,0xf4,0xfa,0xf7,0xfb,0xa7,0xaf,0xac,0xab,0xf3,0xa8,
/* 0xC0 */ 0xb7,0xb5,0xb6,0xc7,0x8e,0x8f,0x92,0x80,0xd4,0x90,0xd2,0xd3,0xde,0xd6,0xd7,0xd8,
/* 0xD0 */ 0xd1,0xa5,0xe3,0xe0,0xe2,0xe5,0x99,0x9e,0x9d,0xeb,0xe9,0xea,0x9a,0xed,0xe7,0xe1,
/* 0xE0 */ 0x85,0xa0,0x83,0xc6,0x84,0x86,0x91,0x87,0x8a,0x82,0x88,0x89,0x8d,0xa1,0x8c,0x8b,
/* 0xF0 */ 0xd0,0xa4,0x95,0xa2,0x93,0xe4,0x94,0xf6,0x9b,0x97,0xa3,0x96,0x81,0xec,0xe8,0x98
};
/*
 * Second-stage translation table: points at a 128-entry array covering
 * codes 128..255, or is NULL when no further translation was requested
 * (the default).
 */
unsigned char *translation_table = NULL;

/*
 * recode_char2 - apply the optional second-stage translation to one byte.
 * Codes below 128 are returned unchanged, as is every code while
 * translation_table is NULL; otherwise the result is entry (c - 128) of
 * the selected table.
 */
unsigned char recode_char2(unsigned char c)
{
    const unsigned char *active = translation_table;

    if (active != NULL && c >= 128) {
        return active[c - 128];
    }
    return c;
}
/* global flag ---- */
/* Non-zero disables word wrapping: main() sets it to 1 for the -w option
 * and format() then prints each paragraph as one long line. */
int nowrap=0;
/* ............................................................. func ... */
/*
 * map_char - return the output string for one input character.
 *
 * map  replacement table whose entries are in the same order as specs[]
 * c    character code to translate
 *
 * If c is listed in specs[] the matching entry of map is returned.
 * Otherwise c goes through recode_char() and recode_char2() and is
 * returned as a one-character string held in a static buffer, which the
 * next such call overwrites.
 */
unsigned char *map_char(unsigned char **map,int c)
{
    static unsigned char buffer[2]="a";
    const char *hit = NULL;

    /* strchr() also matches the NUL that terminates specs[]; without this
     * guard c == 0 would select the null pointer that closes map. */
    if (c != 0)
        hit = strchr((const char *)specs, c);

    if (hit != NULL)
        return map[hit - (const char *)specs];

    buffer[0]=recode_char(c);
    buffer[0]=recode_char2(buffer[0]);
    return buffer;
}
/* ............................................................. func ... */
/*
 * format - translate one paragraph and print it to stdout.
 *
 * buf  NUL-terminated paragraph text
 * map  replacement table handed on to map_char()
 *
 * Every character is expanded with map_char() and collected in a line
 * buffer.  Once the buffer holds more than TEXT_WIDTH characters it is
 * either printed as it is (nowrap set) or broken at its last space; when
 * it contains no space, the first TEXT_WIDTH characters are printed.  Two
 * consecutive 7 codes end a table row: the pending text is printed,
 * followed by the replacement string for '\n'.
 * With wrapping enabled the paragraph is closed by an empty line.
 */
void format( unsigned char *buf, unsigned char **map)
{
    /* A line that wraps at a leading space leaves more than TEXT_WIDTH
     * characters pending, and the longest replacement string may follow,
     * so the buffer is sized well above TEXT_WIDTH. */
    unsigned char outstring[256];
    unsigned char *sp = buf;
    size_t len = 0;             /* current strlen(outstring) */
    int table = 0;              /* previous character was a 7 */

    outstring[0] = '\0';
    while (*sp)
    {
        const unsigned char *piece;
        size_t piece_len;

        if (*sp==7&&table)
        {
            printf("%s%s",outstring,map_char(map,'\n'));
            outstring[0]=0;
            len=0;
            table=0;
            sp++;
            continue;
        }

        piece = map_char(map,*sp);
        piece_len = strlen((const char *)piece);
        if (piece_len >= sizeof outstring - len)
        {
            /* Not reachable with the built-in tables; pass the text
             * through unwrapped rather than write past the buffer. */
            fputs((const char *)outstring,stdout);
            fputs((const char *)piece,stdout);
            outstring[0]=0;
            len=0;
        }
        else
        {
            memcpy(outstring+len,piece,piece_len+1);
            len+=piece_len;
        }

        if (len > TEXT_WIDTH)
        {
            if (nowrap)
            {
                printf("%s",outstring);
                outstring[0]=0;
                len=0;
            }
            else
            {
                unsigned char *dp =
                    (unsigned char *)strrchr((const char *)outstring,' ');
                if (dp)
                {
                    /* break at the last space and keep what follows it;
                     * source and destination overlap, hence memmove */
                    *(dp++)=0;
                    printf("%s\n",outstring);
                    len-=(size_t)(dp-outstring);
                    memmove(outstring,dp,len+1);
                }
                else
                {
                    /* no space to break at: cut after TEXT_WIDTH chars */
                    fwrite(outstring,1,TEXT_WIDTH,stdout);
                    putc('\n',stdout);
                    len-=TEXT_WIDTH;
                    memmove(outstring,outstring+TEXT_WIDTH,len+1);
                }
            }
        }
        table=*(sp++)==7;
    }

    if (nowrap)
    {
        if (outstring[0]!=0)
            printf("%s\n", outstring);
    }
    else
    {
        if (outstring[0]==0)
            putc('\n',stdout);
        else
            printf("%s\n\n",outstring);
    }
}
/* ............................................................. func ... */
/*
 * help - print the usage text to stdout and terminate with exit code 2.
 * The synopsis lists every option described below it, including -w.
 */
void help(void)
{
    printf(
        "catdoc - exctract text from MS-Word files and concate it to stdout\n"
        "Copyright (c) by Victor B. Wagner, 1996\n"
        "Modified by Stefan Schwarzer, 1999\n"
        "Usage catdoc [-astw48] files ...\n"
        "\t-a - converts non-standard printable chars into readable form (default)\n"
        "\t-t - converts them into TeX control sequences\n"
        "\t-4 - converts latin1 output to cp437\n"
        "\t-8 - converts latin1 output to cp850\n"
        "\t-w - disables word wrapping\n"
        "\t-s - exits with code 1 if MSWordDoc signature not found before\n"
        "\t\tfirst printable paragraph\n\n"
        "All options affects only files, specified AFTER them\n"
    );
    exit(2);
}
/* ............................................................. func ... */
/* Collects the text of the paragraph currently being read by do_file(). */
unsigned char buf[BUFFER_SIZE];

/*
 * do_file - extract the printable paragraphs of one stream and print them.
 *
 * f            stream to read, until end of file or a read error
 * map          replacement table handed on to format()
 * search_sign  non-zero: call exit(1) unless the string "MSWordDoc" was
 *              seen before the first paragraph (or before end of input)
 *
 * Printable bytes are collected in buf.  A '\r' ends the paragraph and
 * passes it to format(); 0x0B is stored as '\r' without ending it; \002
 * (Word's footnote mark) is skipped; every other control byte discards
 * what has been collected so far.  Text that does not fit into buf is
 * dropped, so an over-long paragraph is printed truncated.
 */
void do_file(FILE *f, unsigned char **map, int search_sign)
{
    /* Ordinary text may be stored only while bufptr is below this limit,
     * so the last slot stays free for the '\r' or NUL that closes a run. */
    const int text_limit = BUFFER_SIZE - 2;
    int ok =! search_sign;
    int bufptr, c;

    /* ferror() is tested too: after a read error getc() keeps returning
     * EOF while feof() stays false. */
    while( !feof(f) && !ferror(f) )
    {
        bufptr = -1;
        do {
            c = getc(f);
            /* Special printable symbols 7- table separator
             *
             * \r - paragraph end
             * 0x1E - short defis
             *
             */
            if ((c<=255&&c>=32)||c==7||c=='\t'||c=='\r'||c==0x1E)
            {
                if (c=='\r'||bufptr<text_limit)
                    buf[++bufptr]=(unsigned char)c;
            }
            else if (c==0x0b)
            {
                if (bufptr<text_limit)
                    buf[++bufptr]='\r';
            }
            else
            {
                if (!c)
                {
                    buf[++bufptr]=0;
                    if(!strcmp((const char *)buf,"MSWordDoc"))
                    {
                        ok=1;
                    }
                }
                if (c!=2)/* \002 is Word's footnote mark */
                    bufptr=-1; /* all other special symbols
                                  discard buffer */
            }
        }
        while (c!='\r'&&c!=EOF);

        if (c==EOF && !ok) exit(1);
        if (bufptr>0&&buf[bufptr]=='\r')
        {
            if (!ok)
                exit( 1);
            buf[bufptr]=0;
            format(buf,map);
        }
    }
}
/* ............................................................. func ... */
/*
 * main - process the command line from left to right.
 *
 * Options change the state used for the files named after them:
 *   -s  require the MSWordDoc signature (search_sign)
 *   -a  use the ASCII replacement table, -t the TeX one (sequences)
 *   -4 / -8  select the cp437 / cp850 translation table
 *   -w  disable word wrapping
 * "-" reads standard input (once only); any other argument starting with
 * '-' is reported and followed by the usage text.  Everything else is
 * opened as a file in binary mode, processed and closed again.
 * Returns 0; the error paths leave through exit(2).
 */
int main(int argc,char **argv)
{
    int search_sign = 0;                      /* set by -s */
    unsigned char **sequences = ascii_specs;  /* replacement table in use */
    int stdin_processed = 0;
    int i;

    if (argc<2)
    {
        help();
    }
    for(i=1;i<argc;i++)
    {
        const char *arg = argv[i];

        if (!strcmp(arg,"-s"))
        {
            search_sign=1;
        }
        else if (!strcmp(arg,"-t"))
        {
            sequences=TeX_specs;
        }
        else if (!strcmp(arg,"-4"))
        {
            translation_table=table_latin1_to_cp437;
        }
        else if (!strcmp(arg,"-8"))
        {
            translation_table=table_latin1_to_cp850;
        }
        else if (!strcmp(arg,"-a"))
        {
            sequences=ascii_specs;
        }
        else if (!strcmp(arg,"-w"))
        {
            nowrap=1;
        }
        else if (!strcmp(arg,"-"))
        {
            if (stdin_processed)
            {
                fprintf(stderr,"Cannot process standard input twice a row\n");
                exit (2);
            }
            do_file(stdin,sequences,search_sign);
            stdin_processed=1;
        }
        else if (arg[0]=='-')
        {
            fprintf(stderr,"Invalid option %s\n",arg);
            help();
        }
        else
        {
            FILE *f=fopen(arg,"rb");
            if(!f)
            {
                fprintf(stderr,"Cannot open file %s\n",arg);
                exit(2);
            }
            do_file(f,sequences,search_sign);
            /* release the handle before moving on to the next file */
            fclose(f);
        }
    }
    return 0;
}
/* end of file */