home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Crawly Crypt Collection 1
/
crawlyvol1.bin
/
apps
/
science
/
clustalv
/
sequence.c
< prev
next >
Wrap
C/C++ Source or Header
|
1993-04-11
|
8KB
|
336 lines
/********* Sequence input routines for CLUSTALV *******************/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include "clustalv.h"
/*
* Prototypes
*/
extern Boolean linetype(char *,char *);
extern void warning(char *,...);
extern void error(char *,...);
extern char * rtrim(char *);
extern void getstr(char *,char *);
void fill_chartab(void);
int readseqs(int);
static void get_seq(char *,char *,int *,char *);
static void check_infile(int *);
static void p_encode(char *, char *, int);
static void n_encode(char *, char *, int);
static int res_index(char *,char);
static Boolean check_dnaflag(char *, int);
/*
* Global variables
*/
extern FILE *fin;
extern Boolean usemenu, dnaflag, explicit_dnaflag;
extern char seqname[];
extern int nseqs;
extern int *seqlen_array;
extern char **names,**titles;
extern char **seq_array;
extern profile1_empty, profile2_empty;
char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-";
char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW";
char *nucleic_acid_order = "NACGTU";
static int seqFormat;
static char chartab[256];
static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR","Pearson"};
void fill_chartab() /* Create translation and check table */
{
/* static char valid[]="ABCDEFGHIKLMNPQRSTVWXYZ-"; */
register int i;
register char c;
for(i=0;i<256;chartab[i++]=0);
for(i=0;c=amino_acid_codes[i];i++)
chartab[c]=chartab[tolower(c)]=c;
}
static void get_seq(char *sname,char *seq,int *len,char *tit)
{
static char line[MAXLINE+1];
int i;
unsigned char c;
switch(seqFormat) {
case EMBLSWISS:
while( !linetype(line,"ID") )
fgets(line,MAXLINE+1,fin);
strncpy(sname,line+5,MAXNAMES); /* remember entryname */
sname[MAXNAMES]=EOS;
rtrim(sname);
/* while( !linetype(line,"DE") )
fgets(line,MAXLINE+1,fin);
strncpy(tit,line+5,MAXTITLES);
tit[MAXTITLES]=EOS;
i=strlen(tit);
if(tit[i-1]=='\n') tit[i-1]=EOS;
*/
while( !linetype(line,"SQ") )
fgets(line,MAXLINE+1,fin);
*len=0;
while(fgets(line,MAXLINE+1,fin)) {
for(i=0;*len < MAXLEN;i++) {
c=line[i];
if(c == '\n' || c == EOS || c == '/')
break; /* EOL */
if( (c=chartab[c]))
seq[++(*len)]=c;
}
if(*len == MAXLEN || c == '/') break;
}
break;
case PIR:
while(*line != '>')
fgets(line,MAXLINE+1,fin);
strncpy(sname,line+4,MAXNAMES);
sname[MAXNAMES]=EOS;
for(i=MAXNAMES-1;i > 0;i--)
if(isspace(sname[i])) {
sname[i]=EOS;
break;
}
fgets(line,MAXLINE+1,fin);
strncpy(tit,line,MAXTITLES);
tit[MAXTITLES]=EOS;
i=strlen(tit);
if(tit[i-1]=='\n') tit[i-1]=EOS;
*len=0;
while(fgets(line,MAXLINE+1,fin)) {
for(i=0;*len < MAXLEN;i++) {
c=line[i];
if(c == '\n' || c == EOS || c == '*')
break; /* EOL */
if( (c=chartab[c]))
seq[++(*len)]=c;
}
if(*len == MAXLEN || c == '*') break;
}
break;
case PEARSON:
while(*line != '>')
fgets(line,MAXLINE+1,fin);
strncpy(sname,line+1,MAXNAMES);
sname[MAXNAMES]=EOS;
for(i=MAXNAMES-1;i > 0;i--)
if(isspace(sname[i])) {
sname[i]=EOS;
break;
}
*tit=EOS;
*len=0;
while(fgets(line,MAXLINE+1,fin)) {
for(i=0;*len < MAXLEN;i++) {
c=line[i];
if(c == '\n' || c == EOS || c == '>')
break; /* EOL */
if( (c=chartab[c]))
seq[++(*len)]=c;
}
if(*len == MAXLEN || c == '>') break;
}
break;
}
if(*len == MAXLEN)
warning("Sequence %s truncated to %d residues",
sname,MAXLEN);
seq[*len+1]=EOS;
}
int readseqs(int first_seq) /*first_seq is the #no. of the first seq. to read */
{
char line[FILENAMELEN+1];
static char seq1[MAXLEN+2],sname1[MAXNAMES+1],title[MAXTITLES+1];
int i,j,no_seqs;
static int l1;
static Boolean dnaflag1;
if(usemenu)
getstr("Enter the name of the sequence file",line);
else
strcpy(line,seqname);
if(*line == EOS) return -1;
if((fin=fopen(line,"r"))==NULL) {
error("Could not open sequence file %s",line);
return -1; /* DES -1 => file not found */
}
strcpy(seqname,line);
no_seqs=0;
check_infile(&no_seqs);
printf("\nSequence format is %s\n",formatNames[seqFormat]);
if(no_seqs == 0)
return 0; /* return the number of seqs. (zero here)*/
if((no_seqs + first_seq -1) > MAXN) {
error("Too many sequences. Maximum is %d",MAXN);
return 0; /* also return zero if too many */
}
for(i=first_seq;i<=first_seq+no_seqs-1;++i) { /* get the seqs now*/
get_seq(sname1,seq1,&l1,title);
seqlen_array[i]=l1; /* store the length */
strcpy(names[i],sname1); /* " " name */
strcpy(titles[i],title); /* " " title */
if(!explicit_dnaflag) {
dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
if(i == 1) dnaflag = dnaflag1;
} /* type decided by first seq*/
else
dnaflag1 = dnaflag;
if( (!explicit_dnaflag) && (dnaflag1 != dnaflag) )
warning(
"Sequence %d [%s] appears to be of different type to sequence 1",
i,sname1);
if(dnaflag)
n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
else /* as ints */
p_encode(seq1,seq_array[i],l1);
}
fclose(fin);
return no_seqs; /* return the number of seqs. read in this call */
}
static Boolean check_dnaflag(char *seq, int slen)
/* check if DNA or Protein
The decision is based on counting all A,C,G,T,U or N.
If >= 85% of all characters (except -) are as above => DNA */
{
int i, c, nresidues, nbases;
float ratio;
nresidues = nbases = 0;
for(i=1; i <= slen; i++) {
if(seq[i] != '-') {
nresidues++;
if(seq[i] == 'N')
nbases++;
else {
c = res_index(nucleic_acid_order, seq[i]);
if(c > 0)
nbases++;
}
}
}
if( (nbases == 0) || (nresidues == 0) ) return FALSE;
ratio = (float)nbases/(float)nresidues;
/*
fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",
nbases,nresidues,ratio);
*/
if(ratio >= 0.85)
return TRUE;
else
return FALSE;
}
static void check_infile(int *nseqs)
{
char line[MAXLINE+1];
*nseqs=0;
fgets(line,MAXLINE+1,fin);
if( linetype(line,"ID") ) /* EMBL/Swiss-Prot format ? */
seqFormat=EMBLSWISS;
else if(*line == '>') /* no */
seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
else {
seqFormat=UNKNOWN;
return;
}
(*nseqs)++;
while(fgets(line,MAXLINE+1,fin) != NULL) {
switch(seqFormat) {
case EMBLSWISS:
if( linetype(line,"ID") )
(*nseqs)++;
break;
case PIR:
case PEARSON:
if( *line == '>' )
(*nseqs)++;
break;
default:
break;
}
}
fseek(fin,0,0);
}
static void p_encode(char *seq, char *naseq, int l)
{ /* code seq as ints .. use -2 for gap */
register int i;
/* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/
for(i=1;i<=l;i++)
if(seq[i] == '-')
naseq[i] = -2;
else if(seq[i] == 'X')
naseq[i] = 0;
else
naseq[i] = res_index(amino_acid_order,seq[i]);
}
static void n_encode(char *seq,char *naseq,int l)
{ /* code seq as ints .. use -2 for gap */
register int i,c;
/* static char *nucs="ACGTU"; */
for(i=1;i<=l;i++) {
if(seq[i] == '-') /* if a gap character -> code = -2 */
naseq[i] = -2; /* this is the code for a gap in */
else { /* the input files */
c=res_index(nucleic_acid_order,seq[i]);
if (c == 5) c=4;
naseq[i]=c;
}
}
}
static int res_index(char *t,char c)
{
register int i;
for(i=0;t[i] && t[i] != c;i++)
;
if(t[i]) return(i);
else return 0;
}