home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
OS/2 Shareware BBS: 5 Edit
/
05-Edit.zip
/
anwor032.zip
/
antiword.0.32
/
findtext.c
< prev
next >
Wrap
C/C++ Source or Header
|
2001-06-11
|
7KB
|
294 lines
/*
* findtext.c
* Copyright (C) 1998-2001 A.J. van Os; Released under GPL
*
* Description:
* Find the blocks that contain the text of MS Word files
*/
#include <stdio.h>
#include <stdlib.h>
#include "antiword.h"
/*
 * bAddTextBlocks - Add the blocks to the text block list
 *
 * Follows the Big Block Depot chain that starts at lStartBlock. The first
 * lFirstOffset bytes of the chain are skipped; after that one text block
 * is registered per big block until the requested amount of text has been
 * covered or the chain ends.
 *
 * lTotalLength is counted in characters; for Unicode text every character
 * takes two bytes.
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bAddTextBlocks(long lFirstOffset, long lTotalLength,
	BOOL bUsesUnicode, unsigned short usPropMod,
	long lStartBlock, const long *alBBD, size_t tBBDLen)
{
	text_block_type	tTextBlock;
	long	lTextPos, lBlock, lToGo, lSkip;

	fail(lFirstOffset < 0);
	fail(lStartBlock < 0);
	fail(alBBD == NULL);

	NO_DBG_HEX(lFirstOffset);
	NO_DBG_DEC(lTotalLength);

	/* Convert the number of characters to a number of bytes */
	if (!bUsesUnicode) {
		NO_DBG_MSG("Uses ASCII");
		lToGo = lTotalLength;
	} else {
		NO_DBG_MSG("Uses Unicode");
		lToGo = lTotalLength * 2;
	}

	lTextPos = lFirstOffset;
	lSkip = lFirstOffset;
	lBlock = lStartBlock;
	while (lBlock != END_OF_CHAIN && lToGo > 0) {
		if (lBlock < 0 || lBlock >= (long)tBBDLen) {
			/* NOTE(review): presumably does not return - confirm */
			werr(1, "The Big Block Depot is corrupt");
		}
		if (lSkip >= BIG_BLOCK_SIZE) {
			/* The text starts beyond this block; skip all of it */
			lSkip -= BIG_BLOCK_SIZE;
		} else {
			/*
			 * Block numbers are shifted by one when they are
			 * turned into a file offset.
			 */
			tTextBlock.lFileOffset =
				(lBlock + 1) * BIG_BLOCK_SIZE + lSkip;
			tTextBlock.lTextOffset = lTextPos;
			tTextBlock.lLength =
				min(BIG_BLOCK_SIZE - lSkip, lToGo);
			tTextBlock.bUsesUnicode = bUsesUnicode;
			tTextBlock.usPropMod = usPropMod;
			/* Every following block is used from its start */
			lSkip = 0;
			if (!bAdd2TextBlockList(&tTextBlock)) {
				DBG_HEX(tTextBlock.lFileOffset);
				DBG_HEX(tTextBlock.lTextOffset);
				DBG_DEC(tTextBlock.lLength);
				DBG_DEC(tTextBlock.bUsesUnicode);
				DBG_DEC(tTextBlock.usPropMod);
				return FALSE;
			}
			lTextPos += tTextBlock.lLength;
			lToGo -= tTextBlock.lLength;
		}
		lBlock = alBBD[lBlock];
	}
	/* Only successful when all the requested text has been found */
	DBG_DEC_C(lToGo != 0, lToGo);
	return lToGo == 0;
} /* end of bAddTextBlocks */
/*
 * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
 *
 * Code for "fast saved" files.
 *
 * Reads the text information (its offset and length are taken from the
 * header at 0x160 and 0x164) and walks the records in it:
 *	type 0	skipped
 *	type 1	handed to vAdd2PropModList
 *	type 2	the list of pieces; every piece is passed to bAddTextBlocks
 * Processing stops after the first type 2 record.
 *
 * All lengths and counts come from the file, so they are checked against
 * the size of the buffer before they are used as offsets. A record that
 * does not fit in the buffer makes the function fail.
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, long lStartBlock,
	const long *alBBD, size_t tBBDLen, const unsigned char *aucHeader)
{
	unsigned char	*aucBuffer;
	long	lTextOffset, lTotLength, lBeginTextInfo;
	size_t	tTextInfoLen, tOff, tLeft, tStartOff, tDescrOff;
	int	iIndex, iType, iLen, iPieces;
	unsigned short	usPropMod;

	DBG_MSG("bGet6DocumentText");

	fail(pFile == NULL);
	fail(alBBD == NULL);
	fail(aucHeader == NULL);

	lBeginTextInfo = (long)ulGetLong(0x160, aucHeader);	/* fcClx */
	tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader);	/* lcbClx */
	DBG_HEX(lBeginTextInfo);
	DBG_DEC(tTextInfoLen);

	aucBuffer = xmalloc(tTextInfoLen);
	if (!bReadBuffer(pFile, lStartBlock,
			alBBD, tBBDLen, BIG_BLOCK_SIZE,
			aucBuffer, lBeginTextInfo, tTextInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return FALSE;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);

	tOff = 0;
	while (tOff < tTextInfoLen) {
		iType = (int)ucGetByte(tOff, aucBuffer);
		tOff++;
		/* From here on tOff <= tTextInfoLen */
		if (iType == 0) {
			DBG_FIXME();
			tOff++;
			continue;
		}
		if (iType == 1) {
			/* A two byte length followed by that many bytes */
			if (tTextInfoLen - tOff < 2) {
				DBG_MSG("Truncated type 1 record");
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			iLen = (int)usGetWord(tOff, aucBuffer);
			if ((size_t)iLen > tTextInfoLen - tOff - 2) {
				DBG_MSG("Type 1 record exceeds the buffer");
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			vAdd2PropModList(aucBuffer + tOff);
			tOff += (size_t)iLen + 2;
			continue;
		}
		if (iType != 2) {
			werr(0, "Unknown type of 'fastsaved' format");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/* Type 2 */
		if (tTextInfoLen - tOff < 2) {
			DBG_MSG("Truncated type 2 record");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		iLen = (int)usGetWord(tOff, aucBuffer);
		NO_DBG_DEC(iLen);
		tOff += 4;
		iPieces = (iLen - 4) / 12;
		DBG_DEC(iPieces);
		/*
		 * The record holds (iPieces + 1) start positions of four
		 * bytes each, followed by iPieces descriptors of eight
		 * bytes each: 12 * iPieces + 4 bytes in total.
		 */
		tLeft = tOff < tTextInfoLen ? tTextInfoLen - tOff : 0;
		if (iPieces > 0 &&
		    (tLeft < 4 || (size_t)iPieces > (tLeft - 4) / 12)) {
			DBG_MSG("Type 2 record exceeds the buffer");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		for (iIndex = 0; iIndex < iPieces; iIndex++) {
			tStartOff = tOff + (size_t)iIndex * 4;
			tDescrOff = tOff + ((size_t)iPieces + 1) * 4 +
					(size_t)iIndex * 8;
			lTextOffset = (long)ulGetLong(tDescrOff + 2,
							aucBuffer);
			usPropMod = usGetWord(tDescrOff + 6, aucBuffer);
			/* Length = next start position - this one */
			lTotLength =
				(long)ulGetLong(tStartOff + 4, aucBuffer) -
				(long)ulGetLong(tStartOff, aucBuffer);
			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
			if (!bAddTextBlocks(lTextOffset, lTotLength,
					bUsesUnicode, usPropMod,
					lStartBlock,
					alBBD, tBBDLen)) {
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
		}
		break;
	}
	aucBuffer = xfree(aucBuffer);
	return TRUE;
} /* end of bGet6DocumentText */
/*
 * bGet8DocumentText - make a list of the text blocks of Word 8/97 files
 *
 * The text information lives in a table stream. Bit 9 of the word at
 * offset 0x0a of the header selects the 1Table stream (set) or the 0Table
 * stream (clear). A table smaller than MIN_SIZE_FOR_BBD_USE is read via
 * the Small Block Depot, any other via the Big Block Depot.
 *
 * The records of the text information are handled as follows:
 *	type 0	skipped
 *	type 1	handed to vAdd2PropModList
 *	type 2	the list of pieces; every piece is passed to bAddTextBlocks
 * Processing stops after the first type 2 record.
 *
 * All lengths and counts come from the file, so they are checked against
 * the size of the buffer before they are used as offsets. A record that
 * does not fit in the buffer makes the function fail.
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
	const long *alBBD, size_t tBBDLen, const long *alSBD, size_t tSBDLen,
	const unsigned char *aucHeader)
{
	const long	*alBlockDepot;
	unsigned char	*aucBuffer;
	long	lTextOffset, lTotLength, lBeginTextInfo, lLen;
	long	lTableStartBlock, lTableSize, lIndex, lPieces;
	size_t	tTextInfoLen, tBlockDepotLen, tBlockSize, tOff;
	size_t	tLeft, tStartOff, tDescrOff;
	int	iType, iLen;
	BOOL	bUsesUnicode;
	unsigned short	usDocStatus, usPropMod;

	DBG_MSG("bGet8DocumentText");

	fail(pFile == NULL || pPPS == NULL);
	fail(alBBD == NULL || alSBD == NULL);
	fail(aucHeader == NULL);

	lBeginTextInfo = (long)ulGetLong(0x1a2, aucHeader);	/* fcClx */
	tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader);	/* lcbClx */
	DBG_HEX(lBeginTextInfo);
	DBG_DEC(tTextInfoLen);

	/* Use 0Table or 1Table? */
	usDocStatus = usGetWord(0x0a, aucHeader);
	if (usDocStatus & BIT(9)) {
		lTableStartBlock = pPPS->t1Table.lSb;
		lTableSize = pPPS->t1Table.lSize;
	} else {
		lTableStartBlock = pPPS->t0Table.lSb;
		lTableSize = pPPS->t0Table.lSize;
	}
	DBG_DEC(lTableStartBlock);
	if (lTableStartBlock < 0) {
		DBG_DEC(lTableStartBlock);
		return FALSE;
	}
	DBG_HEX(lTableSize);
	if (lTableSize < MIN_SIZE_FOR_BBD_USE) {
		/* Use the Small Block Depot */
		alBlockDepot = alSBD;
		tBlockDepotLen = tSBDLen;
		tBlockSize = SMALL_BLOCK_SIZE;
	} else {
		/* Use the Big Block Depot */
		alBlockDepot = alBBD;
		tBlockDepotLen = tBBDLen;
		tBlockSize = BIG_BLOCK_SIZE;
	}

	aucBuffer = xmalloc(tTextInfoLen);
	if (!bReadBuffer(pFile, lTableStartBlock,
			alBlockDepot, tBlockDepotLen, tBlockSize,
			aucBuffer, lBeginTextInfo, tTextInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return FALSE;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);

	tOff = 0;
	while (tOff < tTextInfoLen) {
		iType = (int)ucGetByte(tOff, aucBuffer);
		tOff++;
		/* From here on tOff <= tTextInfoLen */
		if (iType == 0) {
			DBG_FIXME();
			tOff++;
			continue;
		}
		if (iType == 1) {
			/* A two byte length followed by that many bytes */
			if (tTextInfoLen - tOff < 2) {
				DBG_MSG("Truncated type 1 record");
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			iLen = (int)usGetWord(tOff, aucBuffer);
			if ((size_t)iLen > tTextInfoLen - tOff - 2) {
				DBG_MSG("Type 1 record exceeds the buffer");
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			vAdd2PropModList(aucBuffer + tOff);
			tOff += (size_t)iLen + 2;
			continue;
		}
		if (iType != 2) {
			werr(0, "Unknown type of 'fastsaved' format");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/* Type 2 */
		if (tTextInfoLen - tOff < 4) {
			DBG_MSG("Truncated type 2 record");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		lLen = (long)ulGetLong(tOff, aucBuffer);
		NO_DBG_DEC(lLen);
		tOff += 4;
		/*
		 * A length below four cannot hold any piece. Testing for it
		 * first also keeps the subtraction from overflowing when the
		 * value from the file turns out negative.
		 */
		lPieces = lLen < 4 ? 0 : (lLen - 4) / 12;
		DBG_DEC(lPieces);
		/*
		 * The record holds (lPieces + 1) start positions of four
		 * bytes each, followed by lPieces descriptors of eight
		 * bytes each: 12 * lPieces + 4 bytes in total. The test
		 * divides instead of multiplies to avoid an overflow.
		 */
		tLeft = tTextInfoLen - tOff;
		if (lPieces > 0 &&
		    (tLeft < 4 || (size_t)lPieces > (tLeft - 4) / 12)) {
			DBG_MSG("Type 2 record exceeds the buffer");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		for (lIndex = 0; lIndex < lPieces; lIndex++) {
			tStartOff = tOff + (size_t)lIndex * 4;
			tDescrOff = tOff + ((size_t)lPieces + 1) * 4 +
					(size_t)lIndex * 8;
			lTextOffset = (long)ulGetLong(tDescrOff + 2,
							aucBuffer);
			usPropMod = usGetWord(tDescrOff + 6, aucBuffer);
			/* Length = next start position - this one */
			lTotLength =
				(long)ulGetLong(tStartOff + 4, aucBuffer) -
				(long)ulGetLong(tStartOff, aucBuffer);
			/*
			 * Bit 30 of the offset tells how the piece is stored.
			 * When set the text is not Unicode; the real offset
			 * is found by clearing the bit and halving the rest.
			 */
			if ((lTextOffset & BIT(30)) == 0) {
				bUsesUnicode = TRUE;
			} else {
				bUsesUnicode = FALSE;
				lTextOffset &= ~BIT(30);
				lTextOffset /= 2;
			}
			DBG_HEX_C(usPropMod != 0, usPropMod);
			/* The text itself is in the WordDocument stream */
			if (!bAddTextBlocks(lTextOffset, lTotLength,
					bUsesUnicode, usPropMod,
					pPPS->tWordDocument.lSb,
					alBBD, tBBDLen)) {
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
		}
		break;
	}
	aucBuffer = xfree(aucBuffer);
	return TRUE;
} /* end of bGet8DocumentText */