home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Geek Gadgets 1
/
ADE-1.bin
/
ade-dist
/
id-utils-3.2-src.tgz
/
tar.out
/
fsf
/
id-utils
/
src
/
mkid.c
< prev
next >
Wrap
C/C++ Source or Header
|
1996-09-28
|
22KB
|
840 lines
/* mkid.c -- build an identifer database
Copyright (C) 1986, 1995, 1996 Free Software Foundation, Inc.
Written by Greg McGary <gkm@gnu.ai.mit.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#include <config.h>
#include "xstdlib.h"
#include <assert.h>
#include <stdio.h>
#include <errno.h>
#include <getopt.h>
#include "xsysstat.h"
#include "xstddef.h"
#include "xunistd.h"
#include "xnls.h"
#include "pathmax.h"
#include "xstring.h"
#include "idfile.h"
#include "xmalloc.h"
#include "hash.h"
#include "scanners.h"
#include "error.h"
#include "xalloca.h"
#if HAVE_LIMITS_H
# include <limits.h>
#endif
struct summary
{
struct token **sum_tokens;
unsigned char const *sum_hits;
struct summary *sum_parent;
union {
struct summary *u_kids[8]; /* when sum_level > 0 */
#define sum_kids sum_u.u_kids
struct member_file *u_files[8]; /* when sum_level == 0 */
#define sum_files sum_u.u_files
} sum_u;
unsigned long sum_tokens_size;
unsigned long sum_hits_count;
int sum_free_index;
int sum_level;
};
void usage __P((void));
static void help_me __P((void));
int main __P((int argc, char **argv));
void assert_writeable __P((char const *file_name));
void scan_files __P((struct idhead *idhp));
void scan_member_file __P((struct member_file const *member));
void scan_member_file_1 __P((get_token_func_t get_token, void const *args, FILE *source_FILE));
void report_statistics __P((void));
void write_id_file __P((struct idhead *idhp));
unsigned long token_hash_1 __P((void const *key));
unsigned long token_hash_2 __P((void const *key));
int token_hash_cmp __P((void const *x, void const *y));
int token_qsort_cmp __P((void const *x, void const *y));
void bump_current_hits_signature __P((void));
void init_hits_signature __P((int i));
void free_summary_tokens __P((void));
void summarize __P((void));
void init_summary __P((void));
struct summary *make_sibling_summary __P((struct summary *summary));
int count_vec_size __P((struct summary *summary, unsigned char const *tail_hits));
int count_buf_size __P((struct summary *summary, unsigned char const *tail_hits));
void assert_hits __P((struct summary* summary));
void write_hits __P((FILE *fp, struct summary *summary, unsigned char const *tail_hits));
void sign_token __P((struct token *token));
void add_token_to_summary __P((struct summary *summary, struct token *token));
struct hash_table token_table;
/* Miscellaneous statistics */
size_t input_chars;
size_t name_tokens;
size_t number_tokens;
size_t string_tokens;
size_t literal_tokens;
size_t comment_tokens;
size_t occurrences;
size_t hits_length = 0;
size_t tokens_length = 0;
size_t output_length = 0;
int verbose_flag = 0;
int statistics_flag = 0;
int file_name_count = 0; /* # of files in database */
int levels = 0; /* ceil(log(8)) of file_name_count */
unsigned char current_hits_signature[MAX_LEVELS];
#define INIT_TOKENS_SIZE(level) (1 << ((level) + 13))
struct summary *summary_root;
struct summary *summary_leaf;
char const *program_name;
char *lang_map_file_name = 0;
int show_version = 0;
int show_help = 0;
struct idhead idh;
struct file_link *cw_dlink;
void
usage (void)
{
fprintf (stderr, _("Try `%s --help' for more information.\n"),
program_name);
exit (1);
}
static struct option const long_options[] =
{
{ "file", required_argument, 0, 'f' },
{ "output", required_argument, 0, 'o' },
{ "include", required_argument, 0, 'i' },
{ "exclude", required_argument, 0, 'x' },
{ "lang-option", required_argument, 0, 'l' },
{ "lang-map", required_argument, 0, 'm' },
{ "default-lang", required_argument, 0, 'd' },
{ "prune", required_argument, 0, 'p' },
{ "verbose", no_argument, 0, 'v' },
{ "statistics", no_argument, 0, 's' },
{ "help", no_argument, &show_help, 1 },
{ "version", no_argument, &show_version, 1 },
{ 0 }
};
static void
help_me (void)
{
printf (_("\
Usage: %s [OPTION]... [FILE]...\n\
"), program_name);
printf (_("\
Build an identifier database.\n\
-o, --output=OUTFILE file name of ID database output\n\
-f, --file=OUTFILE synonym for --output\n\
-i, --include=LANGS include languages in LANGS (default: \"C C++ asm\")\n\
-x, --exclude=LANGS exclude languages in LANGS\n\
-l, --lang-option=L:OPT pass OPT as a default for language L (see below)\n\
-m, --lang-map=MAPFILE use MAPFILE to map file names onto source language\n\
-d, --default-lang=LANG make LANG the default source language\n\
-p, --prune=NAMES exclude the named files and/or directories\n\
-v, --verbose report per file statistics\n\
-s, --statistics report statistics at end of run\n\
\n\
--help display this help and exit\n\
--version output version information and exit\n\
\n\
FILE may be a file name, or a directory name to recursively search.\n\
If no FILE is given, the current directory is searched by default.\n\
Note that the `--include' and `--exclude' options are mutually-exclusive.\n\
\n\
The following arguments apply to the language-specific scanners:\n\
"));
language_help_me ();
exit (0);
}
#ifdef HAVE_SBRK
char const *heap_initial;
char const *heap_after_walk;
char const *heap_after_scan;
#endif
int
main (int argc, char **argv)
{
program_name = argv[0];
#ifdef HAVE_SBRK
heap_initial = (char const *) sbrk (0);
#endif
idh.idh_file_name = DEFAULT_ID_FILE_NAME;
/* Set locale according to user's wishes. */
setlocale (LC_ALL, "");
/* Tell program which translations to use and where to find. */
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
for (;;)
{
int optc = getopt_long (argc, argv, "o:f:i:x:l:m:d:p:vs",
long_options, (int *) 0);
if (optc < 0)
break;
switch (optc)
{
case 0:
break;
case 'o':
case 'f':
idh.idh_file_name = optarg;
break;
case 'i':
include_languages (optarg);
break;
case 'x':
exclude_languages (optarg);
break;
case 'l':
language_save_arg (optarg);
break;
case 'm':
lang_map_file_name = optarg;
break;
case 'd':
set_default_language (optarg);
break;
case 'p':
if (cw_dlink == 0)
cw_dlink = init_walker (&idh);
prune_file_names (optarg, cw_dlink);
break;
case 'v':
verbose_flag = 1;
statistics_flag = 1;
break;
case 's':
statistics_flag = 1;
break;
default:
usage ();
}
}
if (show_version)
{
printf ("%s - %s\n", program_name, PACKAGE_VERSION);
exit (0);
}
if (show_help)
help_me ();
argc -= optind;
argv += optind;
if (argc == 0)
{
static char *dot = (char *) ".";
argc = 1;
argv = ˙
}
language_getopt ();
assert_writeable (idh.idh_file_name);
if (cw_dlink == 0)
cw_dlink = init_walker (&idh);
parse_language_map (lang_map_file_name);
while (argc--)
{
struct file_link *flink = parse_file_name (*argv++, cw_dlink);
if (flink)
walk_flink (flink, 0);
}
#ifdef HAVE_SBRK
heap_after_walk = (char const *) sbrk (0);
#endif
mark_member_file_links (&idh);
if (idh.idh_member_file_table.ht_fill)
{
scan_files (&idh);
#ifdef HAVE_SBRK
heap_after_scan = sbrk (0);
#endif
free_summary_tokens ();
free (token_table.ht_vec);
chdir_to_link (cw_dlink);
write_id_file (&idh);
if (statistics_flag)
report_statistics ();
}
else
error (0, 0, "nothing to do");
exit (0);
}
void
assert_writeable (char const *file_name)
{
if (access (file_name, 06) < 0)
{
if (errno == ENOENT)
{
char const *dir_name = dirname (file_name);
if (!dir_name || !*dir_name)
dir_name = ".";
if (access (dir_name, 06) < 0)
error (1, errno, _("can't create `%s' in `%s'"),
basename (file_name), dir_name);
}
else
error (1, errno, _("can't modify `%s'"), file_name);
}
}
void
scan_files (struct idhead *idhp)
{
struct member_file **members_0
= (struct member_file **) hash_dump (&idhp->idh_member_file_table,
0, member_file_qsort_compare);
struct member_file **end = &members_0[idhp->idh_member_file_table.ht_fill];
struct member_file **members = members_0;
hash_init (&token_table, idhp->idh_member_file_table.ht_fill * 64,
token_hash_1, token_hash_2, token_hash_cmp);
init_hits_signature (0);
init_summary ();
obstack_init (&tokens_obstack);
for (;;)
{
struct member_file *member = *members++;
scan_member_file (member);
if (members == end)
break;
if (current_hits_signature[0] & 0x80)
summarize ();
bump_current_hits_signature ();
}
free (members_0);
}
void
scan_member_file (struct member_file const *member)
{
struct lang_args const *lang_args = member->mf_lang_args;
struct language const *lang = lang_args->la_language;
get_token_func_t get_token = lang->lg_get_token;
struct file_link *flink = member->mf_link;
struct stat st;
FILE *source_FILE;
chdir_to_link (flink->fl_parent);
source_FILE = fopen (flink->fl_name, "r");
if (source_FILE)
{
if (statistics_flag)
{
if (fstat (fileno (source_FILE), &st) < 0)
{
char *file_name = ALLOCA (char, PATH_MAX);
maybe_relative_file_name (file_name, flink, cw_dlink);
error (0, errno, _("can't stat `%s'"), file_name);
}
else
input_chars += st.st_size;
}
if (verbose_flag)
{
char *file_name = ALLOCA (char, PATH_MAX);
maybe_relative_file_name (file_name, flink, cw_dlink);
printf ("%d: %s: %s", member->mf_index, lang->lg_name, file_name);
fflush (stdout);
}
scan_member_file_1 (get_token, lang_args->la_args_digested, source_FILE);
if (verbose_flag)
putchar ('\n');
fclose (source_FILE);
}
else
error (0, errno, _("can't open `%s'"), flink->fl_name);
}
void
scan_member_file_1 (get_token_func_t get_token, void const *args, FILE *source_FILE)
{
struct token **slot;
struct token *token;
int flags;
int new_tokens = 0;
int distinct_tokens = 0;
while ((token = (*get_token) (source_FILE, args, &flags)) != NULL)
{
if (*token->tok_name == '\0') {
obstack_free (&tokens_obstack, token);
continue;
}
slot = (struct token **) hash_find_slot (&token_table, token);
if (HASH_VACANT (*slot))
{
token->tok_flags = flags;
token->tok_count = 1;
memset (token->tok_hits, 0, sizeof (token->tok_hits));
sign_token (token);
if (verbose_flag)
{
distinct_tokens++;
new_tokens++;
}
hash_insert_at (&token_table, token, slot);
}
else
{
obstack_free (&tokens_obstack, token);
token = *slot;
token->tok_flags |= flags;
if (token->tok_count < USHRT_MAX)
token->tok_count++;
if (!(token->tok_hits[0] & current_hits_signature[0]))
{
sign_token (token);
if (verbose_flag)
distinct_tokens++;
}
}
}
if (verbose_flag)
{
printf (_(" new = %d/%d"), new_tokens, distinct_tokens);
if (distinct_tokens != 0)
printf (" = %.0f%%", 100.0 * (double) new_tokens / (double) distinct_tokens);
}
}
void
report_statistics (void)
{
printf (_("Name=%ld, "), name_tokens);
printf (_("Number=%ld, "), number_tokens);
printf (_("String=%ld, "), string_tokens);
printf (_("Literal=%ld, "), literal_tokens);
printf (_("Comment=%ld\n"), comment_tokens);
printf (_("Files=%d, "), idh.idh_files);
printf (_("Tokens=%ld, "), occurrences);
printf (_("Bytes=%ld Kb, "), input_chars / 1024);
#ifdef HAVE_SBRK
printf (_("Heap=%ld+%ld Kb, "), (heap_after_scan - heap_after_walk) / 1024,
(heap_after_walk - heap_initial) / 1024);
#endif
printf (_("Output=%ld (%ld tok, %ld hit)\n"), output_length, tokens_length, hits_length);
hash_print_stats (&token_table, stdout);
printf (_(", Freq=%ld/%ld=%.2f\n"), occurrences, token_table.ht_fill,
(double) occurrences / (double) token_table.ht_fill);
}
/* As the database is written, may need to adjust the file names. If
we are generating the ID file in a remote directory, then adjust
the file names to be relative to the location of the ID database.
(This would be a common useage if you want to make a database for a
directory which you have no write access to, so you cannot create
the ID file.) */
void
write_id_file (struct idhead *idhp)
{
struct token **tokens;
int i;
int buf_size;
int vec_size;
int tok_size;
int max_buf_size = 0;
int max_vec_size = 0;
if (verbose_flag)
printf (_("Sorting tokens...\n"));
assert (summary_root->sum_hits_count == token_table.ht_fill);
tokens = REALLOC (summary_root->sum_tokens, struct token *, token_table.ht_fill);
qsort (tokens, token_table.ht_fill, sizeof (struct token *), token_qsort_cmp);
if (verbose_flag)
printf (_("Writing `%s'...\n"), idhp->idh_file_name);
idhp->idh_FILE = fopen (idhp->idh_file_name, "w+b");
if (idhp->idh_FILE == NULL)
error (1, errno, _("can't create `%s'"), idhp->idh_file_name);
idhp->idh_magic[0] = IDH_MAGIC_0;
idhp->idh_magic[1] = IDH_MAGIC_1;
idhp->idh_version = IDH_VERSION;
idhp->idh_flags = IDH_COUNTS;
/* write out the list of pathnames */
fseek (idhp->idh_FILE, sizeof_idhead (), 0);
idhp->idh_flinks_offset = ftell (idhp->idh_FILE);
serialize_file_links (idhp);
/* write out the list of identifiers */
putc ('\0', idhp->idh_FILE);
putc ('\0', idhp->idh_FILE);
idhp->idh_tokens_offset = ftell (idhp->idh_FILE);
for (i = 0; i < token_table.ht_fill; i++, tokens++)
{
struct token *token = *tokens;
occurrences += token->tok_count;
if (token->tok_flags & TOK_NUMBER)
number_tokens++;
if (token->tok_flags & TOK_NAME)
name_tokens++;
if (token->tok_flags & TOK_STRING)
string_tokens++;
if (token->tok_flags & TOK_LITERAL)
literal_tokens++;
if (token->tok_flags & TOK_COMMENT)
comment_tokens++;
fputs (token->tok_name, idhp->idh_FILE);
putc ('\0', idhp->idh_FILE);
if (token->tok_count > 0xff)
token->tok_flags |= TOK_SHORT_COUNT;
putc (token->tok_flags, idhp->idh_FILE);
putc (token->tok_count & 0xff, idhp->idh_FILE);
if (token->tok_flags & TOK_SHORT_COUNT)
putc (token->tok_count >> 8, idhp->idh_FILE);
vec_size = count_vec_size (summary_root, token->tok_hits + levels);
buf_size = count_buf_size (summary_root, token->tok_hits + levels);
hits_length += buf_size;
tok_size = strlen (token->tok_name) + 1;
tokens_length += tok_size;
buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2;
if (buf_size > max_buf_size)
max_buf_size = buf_size;
if (vec_size > max_vec_size)
max_vec_size = vec_size;
write_hits (idhp->idh_FILE, summary_root, token->tok_hits + levels);
putc ('\0', idhp->idh_FILE);
putc ('\0', idhp->idh_FILE);
}
assert_hits (summary_root);
idhp->idh_tokens = token_table.ht_fill;
output_length = ftell (idhp->idh_FILE);
idhp->idh_end_offset = output_length - 2;
idhp->idh_buf_size = max_buf_size;
idhp->idh_vec_size = max_vec_size;
write_idhead (&idh);
fclose (idhp->idh_FILE);
}
unsigned long
token_hash_1 (void const *key)
{
return_STRING_HASH_1 (((struct token const *) key)->tok_name);
}
unsigned long
token_hash_2 (void const *key)
{
return_STRING_HASH_2 (((struct token const *) key)->tok_name);
}
int
token_hash_cmp (void const *x, void const *y)
{
return_STRING_COMPARE (((struct token const *) x)->tok_name,
((struct token const *) y)->tok_name);
}
int
token_qsort_cmp (void const *x, void const *y)
{
return_STRING_COMPARE ((*(struct token const *const *) x)->tok_name,
(*(struct token const *const *) y)->tok_name);
}
/****************************************************************************/
void
bump_current_hits_signature (void)
{
unsigned char *hits = current_hits_signature;
while (*hits & 0x80)
*hits++ = 1;
*hits <<= 1;
}
void
init_hits_signature (int i)
{
unsigned char *hits = current_hits_signature;
unsigned char const *end = ¤t_hits_signature[MAX_LEVELS];
while (hits < end)
{
*hits = 1 << (i & 7);
i >>= 3;
hits++;
}
}
void
free_summary_tokens (void)
{
struct summary *summary = summary_leaf;
while (summary != summary_root)
{
free (summary->sum_tokens);
summary = summary->sum_parent;
}
}
void
summarize (void)
{
unsigned char const *hits_sig = current_hits_signature;
struct summary *summary = summary_leaf;
do
{
unsigned long count = summary->sum_hits_count;
unsigned char *hits = MALLOC (unsigned char, count + 1);
unsigned int level = summary->sum_level;
struct token **tokens = summary->sum_tokens;
unsigned long init_size = INIT_TOKENS_SIZE (summary->sum_level);
if (verbose_flag)
printf (_("level %d: %ld/%ld = %.0f%%\n"),
summary->sum_level, count, init_size,
100.0 * (double) count / (double) init_size);
qsort (tokens, count, sizeof (struct token *), token_qsort_cmp);
summary->sum_hits = hits;
while (count--)
{
unsigned char *hit = &(*tokens++)->tok_hits[level];
*hits++ = *hit;
*hit = 0;
}
*hits++ = 0;
if (summary->sum_parent)
{
free (summary->sum_tokens);
summary->sum_tokens = 0;
}
summary = summary->sum_parent;
}
while (*++hits_sig & 0x80);
summary_leaf = make_sibling_summary (summary_leaf);
}
void
init_summary (void)
{
unsigned long size = INIT_TOKENS_SIZE (0);
summary_root = summary_leaf = CALLOC (struct summary, 1);
summary_root->sum_tokens_size = size;
summary_root->sum_tokens = MALLOC (struct token *, size);
}
struct summary *
make_sibling_summary (struct summary *summary)
{
struct summary *parent = summary->sum_parent;
unsigned long size;
if (parent == NULL)
{
levels++;
summary_root = summary->sum_parent = parent = CALLOC (struct summary, 1);
parent->sum_level = levels;
parent->sum_kids[0] = summary;
parent->sum_hits_count = summary->sum_hits_count;
parent->sum_free_index = 1;
size = INIT_TOKENS_SIZE (levels);
if (summary->sum_tokens_size >= size)
{
parent->sum_tokens_size = summary->sum_tokens_size;
parent->sum_tokens = summary->sum_tokens;
}
else
{
parent->sum_tokens_size = size;
parent->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
}
summary->sum_tokens = 0;
}
if (parent->sum_free_index == 8)
parent = make_sibling_summary (parent);
summary = CALLOC (struct summary, 1);
summary->sum_level = parent->sum_level - 1;
parent->sum_kids[parent->sum_free_index++] = summary;
summary->sum_parent = parent;
size = INIT_TOKENS_SIZE (summary->sum_level);
summary->sum_tokens_size = size;
summary->sum_tokens = MALLOC (struct token *, size);
return summary;
}
int
count_vec_size (struct summary *summary, unsigned char const *tail_hits)
{
struct summary **kids;
unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
kids = summary->sum_kids;
if (*kids == NULL)
{
static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4];
}
else
{
int bit;
int count = 0;
--tail_hits;
for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
if (bit & hits)
count += count_vec_size (*kids, tail_hits);
return count;
}
}
int
count_buf_size (struct summary *summary, unsigned char const *tail_hits)
{
struct summary **kids;
unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
kids = summary->sum_kids;
if (*kids == NULL)
return 1;
else
{
int bit;
int count = 1;
--tail_hits;
for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
if (bit & hits)
count += count_buf_size (*kids, tail_hits);
return count;
}
}
void
assert_hits (struct summary* summary)
{
struct summary **kids = summary->sum_kids;
struct summary **end = &kids[8];
assert (summary->sum_hits == NULL || *summary->sum_hits == 0);
if (end[-1] == 0)
while (*--end == 0)
;
while (kids < end)
assert_hits (*kids++);
}
void
write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits)
{
struct summary **kids;
unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits);
assert (hits);
putc (hits, fp);
kids = summary->sum_kids;
if (*kids)
{
int bit;
--tail_hits;
for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids)
if (bit & hits)
write_hits (fp, *kids, tail_hits);
}
}
void
sign_token (struct token *token)
{
unsigned char *tok_hits = token->tok_hits;
unsigned char *hits_sig = current_hits_signature;
unsigned char *end = ¤t_hits_signature[MAX_LEVELS];
struct summary *summary = summary_leaf;
while (summary)
{
if (*tok_hits == 0)
add_token_to_summary (summary, token);
if (*tok_hits & *hits_sig)
break;
*tok_hits |= *hits_sig;
summary = summary->sum_parent;
tok_hits++;
hits_sig++;
}
while (hits_sig < end)
{
if (*tok_hits & *hits_sig)
break;
*tok_hits |= *hits_sig;
tok_hits++;
hits_sig++;
}
}
void
add_token_to_summary (struct summary *summary, struct token *token)
{
unsigned long size = summary->sum_tokens_size;
if (summary->sum_hits_count >= size)
{
size *= 2;
summary->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
summary->sum_tokens_size = size;
}
summary->sum_tokens[summary->sum_hits_count++] = token;
}