home *** CD-ROM | disk | FTP | other *** search
Text File | 1989-06-27 | 16.8 KB | 600 lines | [TEXT/MPS ] |
- {[b+,c+,n+,u+,r+,rec+,#+,j=13-/40/1o,:+,t=2,o=80] PasMat }
- { readseq.p -- read nucleic/protein sequence in various formats
- -- file may have multiple sequences,
- -- based loosely on UWGCG & Zuker Fortran source
- }
-
- UNIT sequenceIO;
- {Public interface of the sequence reader: the buffer type, the control
-  codes, and the single entry point readSeq.}
- INTERFACE
-
- TYPE
- {Character buffer handed to readSeq. It receives sequence symbols, or
-  in list mode the text of the sequence list.}
- Sequence = PACKED ARRAY [0..32000] OF CHAR;
- SeqPtr = ^Sequence;
-
- CONST
- {readSeq control calls}
- rseqFetch1 = 0; {fetch 1st seq, no prior read}
- rseqList = 1; {get list of available seq}
- rseqFetchN = 2; {fetch nth seq, with Inquire info}
- {The implementation additionally accepts 98 and 99, which select the
-  local Preer reader: 99 lists, 98 fetches.}
-
- {readSeq opens the text file fname, works out its format and, depending
-  on control, either copies sequence symbols into seq^ or builds a list
-  of the sequences in the file.
-    nseq   in:  number of characters seq^ may receive.
-           out: count of sequence symbols (fetch modes), or count of
-                characters written to the list (rseqList).
-    seqId  in:  for rseqFetchN, the id of the wanted sequence exactly as
-                a previous list call produced it; its second-to-last
-                character is the digit that selects the format reader.
-           out: id line of the sequence read, with a bracketed format
-                tag appended.
-  In list mode each entry written to seq^ is an id string followed by a
-  carriage return, chr(13).
-  The result is the ioresult of opening the file when that is nonzero,
-  -1 when the file holds no non-blank line, and 0 otherwise.}
- FUNCTION readSeq(fname: String; {file name}
- seq: SeqPtr; {sequence storage(0,2), or list(1)}
- VAR nseq: longint; {in: storage size, out: seq size}
- VAR seqId: String; {in(2): seq to fetch, out: seq info}
- control: integer {rseq control(0,1,2)}
- ): integer; {error code: 0=okay}
-
- IMPLEMENTATION
-
- {Implementation header of readSeq. The variables declared here are
-  shared by all the nested reader procedures that follow.}
- FUNCTION readSeq(fname: String; {file name}
- seq: SeqPtr; {sequence storage(0,2), or list(1)}
- VAR nseq: longint; {in: storage size, out: seq size}
- VAR seqId: String; {in(2): seq to fetch, out: seq info}
- control: integer {rseq control(0,1,2)}
- ): integer; {error code: 0=okay}
-
- VAR
- {the input file, opened with reset in the main body}
- f : text;
- {err: ioresult of opening f, later passed to exitseq as the result;
-  i: scratch counter used by the format probes in the main body}
- err, i : integer;
- {s: the line most recently read from f;
-  mySeq: copy of the caller's incoming seqId, compared in rseqFetchN mode}
- s, mySeq : String;
- {done: loop-termination flag of the read loops;
-  gotuw: a UWGCG '..' marker line was seen;
-  isfitch: result of the Fitch layout probe;
-  addit: false in list mode, so addseq counts symbols without storing;
-  doadd: set by readLoop, true when the current record is to be read}
- done, gotuw, isfitch, addit, doadd: boolean;
- {maxseq: capacity of seq^ as passed in through nseq;
-  ninfo: number of list characters written to seq^ by addinfo}
- maxseq, ninfo: longint;
- {the characters accepted as sequence symbols; set in the main body}
- seqCharSet : SET OF CHAR;
-
- PROCEDURE exitseq(err: integer);
- {Single way out of readSeq: publishes err as the function result,
-  closes the input file and leaves readSeq from whatever nesting level
-  the caller is at. In list mode the caller's nseq is set to the number
-  of list characters written rather than to a sequence length.}
-
- BEGIN
-   readSeq := err;
-   IF control = rseqList THEN
-     nseq := ninfo;
-   close(f);
-   exit(readSeq);
- END;
-
- FUNCTION isSeqChar(c: CHAR): boolean;
- {True when c is a sequence symbol: a character above the blank that
-  is a member of seqCharSet. Blanks and control characters are always
-  rejected.
-
-  NOTE: declaring the parameter as (VAR c: char) misbehaves under
-  MPW Pascal, so c is passed by value.
-
-  Background: GCG programs allow all upper and lower case letters,
-  periods (.), asterisks (*), pluses (+), ampersands (&) and ats (@)
-  as symbols in biological sequences. Digits ['0'..'9'] are NOT
-  allowed here.}
-
- BEGIN
-   isSeqChar := (c > ' ') AND (c IN seqCharSet);
- END; {isSeqChar}
-
- PROCEDURE addseq( s: String);
- {Scan one input line and append its sequence symbols to the result.
-  Characters rejected by isSeqChar are skipped, and nothing more is
-  taken once nseq has reached maxseq. When addit is false the symbols
-  are counted in nseq but not stored in seq^.}
-
- VAR
-   k : integer;
-
- BEGIN
-   FOR k := 1 TO length(s) DO
-     IF nseq < maxseq THEN
-       IF isSeqChar(s[k]) THEN BEGIN
-         IF addit THEN seq^[nseq] := s[k];
-         nseq := nseq + 1;
-       END;
- END; {addseq}
-
- PROCEDURE addinfo(VAR s: string);
- {Append one list entry to seq^: the characters of s followed by a
-  carriage return, chr(13). Output stops silently once ninfo reaches
-  maxseq.
-
-  The terminator is emitted on its own instead of being poked into
-  s[length(s) + 1]. That store wrote outside the string whenever s was
-  already 255 characters long, and it modified the caller's variable.
-  s remains a VAR parameter so existing calls are unaffected.}
-
- VAR
-   i : integer;
-
-   PROCEDURE putInfo(c: CHAR);
-   {Store one list character if there is still room.}
-
-   BEGIN
-     IF (ninfo < maxseq) THEN BEGIN
-       seq^[ninfo] := c;
-       ninfo := ninfo + 1;
-     END;
-   END; {putInfo}
-
- BEGIN
-   FOR i := 1 TO length(s) DO
-     putInfo(s[i]);
-   putInfo(chr(13));
- END; {addinfo}
-
- {Shared record reader used by the multi-sequence formats. It reads
-  lines from f into s until end of file or until endTest(s) reports the
-  end of the current record, feeding the lines to addseq.
-    margin   number of leading characters deleted from each line before
-             it is added (0 = none).
-    endadd   when true, the line that ends the record is itself added.
-    firstadd when true, the line already held in s is added before any
-             further line is read.
-    endTest  format-specific predicate recognising the end of a record.
-  nseq is reset to 0 on entry, so it counts the current record only.
-  What happens after the record depends on control: rseqFetch1 leaves
-  readSeq, rseqList appends seqId to the list and returns to the
-  caller, rseqFetchN leaves readSeq only if this was the wanted record.}
- PROCEDURE readLoop(margin: integer; endadd, firstadd: boolean; FUNCTION
- endTest(VAR s: String): boolean);
-
- BEGIN
- {In rseqFetchN mode only the record whose id equals the caller's
-  requested id (saved in mySeq) is read; other records are scanned past.}
- IF control = rseqFetchN THEN doadd := seqId = mySeq
- ELSE doadd := true;
- nseq := 0;
- IF firstadd THEN addseq(s); {! fitch 1st string}
- REPEAT
- {! check eof Before read !}
- done := eof(f);
- readln(f, s);
- done := done OR endTest(s);
- {Unless endadd is set the terminating line is not added, and so is
-  also left with its margin intact for the caller to inspect.}
- IF doadd AND (endadd OR NOT done) THEN BEGIN
- IF margin > 0 THEN delete(s, 1, margin);
- addseq(s);
- END;
- UNTIL done;
-
- {NOTE(review): no branch covers control values other than 0, 1, 2;
-  confirm what the compiler does with an unmatched CASE selector.}
- CASE control OF
- rseqFetch1: exitseq(err);
- rseqList: addinfo(seqId);
- rseqFetchN: IF doadd THEN exitseq(err);
- END;
- END; {readLoop}
-
- FUNCTION endIG(VAR s: String): boolean;
- {A Stanford/IG sequence ends on the line that contains a '1' or a '2'.}
-
- BEGIN
-   IF pos('1', s) > 0 THEN endIG := true
-   ELSE endIG := pos('2', s) > 0;
- END;
-
- PROCEDURE readIG; {IG -- many seqs/file }
- {Reads Stanford/IG records one after another. Blank lines and lines
-  starting with ';' are skipped, the next line becomes the id, and
-  readLoop takes the sequence lines, terminator line included. The
-  procedure is only ever left through exitseq.}
-
- BEGIN
-   REPEAT
-     {skip blanks and ';' comments up to the title line}
-     REPEAT
-       readln(f, s);
-     UNTIL eof(f) OR ((s <> '') AND (pos(';', s) <> 1));
-     IF eof(f) THEN exitseq(err);
-     seqId := concat(s, ' [Stanford/IG 1]');
-     readLoop(0, true, false, endIG);
-     {move on until s is a non-blank, non-comment line}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos(';', s) = 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readIG}
-
- FUNCTION endStrider(VAR s: String): boolean;
- {A DNA Strider sequence ends on a line containing '//'.}
-
- VAR
-   where : integer;
-
- BEGIN
-   where := pos('//', s);
-   endStrider := where > 0;
- END;
-
- PROCEDURE readStrider;
- {Reads DNA Strider records. The next line read supplies the id, minus
-  its first character; following lines starting with ';' are skipped,
-  and the first other line is handed to readLoop as the start of the
-  sequence. The procedure is only ever left through exitseq.}
-
- BEGIN
-   REPEAT
-     {Strider -- 1 seq only? }
-     readln(f, s);
-     seqId := concat(s, ' [DNA Strider 2]');
-     delete(seqId, 1, 1);
-     REPEAT
-       readln(f, s);
-     UNTIL eof(f) OR (pos(';', s) <> 1);
-     {s already holds the first sequence line, hence firstadd = true}
-     readLoop(0, false, true, endStrider);
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos(';', s) = 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readStrider}
-
- FUNCTION endGB(VAR s: String): boolean;
- {A GenBank record ends on a line containing '//' or on a line that
-  begins with 'LOCUS'.}
-
- BEGIN
-   IF pos('//', s) > 0 THEN endGB := true
-   ELSE endGB := pos('LOCUS', s) = 1;
- END;
-
- PROCEDURE readGenBank;
- {Reads GenBank records. On entry s holds a LOCUS line; the id is that
-  line without its first 12 characters. Lines are skipped up to the one
-  beginning with 'ORIGIN', then readLoop takes the sequence lines and
-  drops the first 9 characters of each. The procedure is only ever left
-  through exitseq.}
-
- BEGIN
-   REPEAT
-     {GenBank -- many seqs/file }
-     seqId := concat(s, ' [GenBank 3]');
-     delete(seqId, 1, 12);
-     REPEAT
-       readln(f, s);
-     UNTIL eof(f) OR ((s <> '') AND (pos('ORIGIN', s) = 1));
-     readLoop(9, false, false, endGB);
-     {advance to the next LOCUS line, if there is one}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos('LOCUS', s) <> 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readGenBank}
-
- FUNCTION endNBRF(VAR s: String): boolean;
- {An NBRF record ends where the next line begins with '>'.}
-
- VAR
-   where : integer;
-
- BEGIN
-   where := pos('>', s);
-   endNBRF := where = 1;
- END;
-
- PROCEDURE readNBRF;
- {Reads NBRF/PIR records. On entry s holds the header line; the id is
-  that line without its first 4 characters. The line after the header
-  is discarded and readLoop takes the sequence lines. The procedure is
-  only ever left through exitseq.}
-
- BEGIN
-   REPEAT
-     {NBRF -- many seqs/file }
-     seqId := concat(s, ' [NBRF/PIR 4]');
-     delete(seqId, 1, 4);
-     readln(f, s); {junk line}
-     readLoop(0, false, false, endNBRF);
-     {advance to the next '>DL;' header, if there is one}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos('>DL;', s) <> 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readNBRF}
-
- FUNCTION endPearson(VAR s: String): boolean;
- {A Pearson record ends where the next line begins with '>'.}
-
- VAR
-   where : integer;
-
- BEGIN
-   where := pos('>', s);
-   endPearson := where = 1;
- END;
-
- PROCEDURE readPearson;
- {Reads Pearson records. On entry s holds the '>' header line; the id
-  is that line without its first character. readLoop takes the
-  sequence lines that follow. The procedure is only ever left through
-  exitseq.}
-
- BEGIN
-   REPEAT
-     {Pearson -- many seqs/file }
-     seqId := concat(s, ' [Pearson 5]');
-     delete(seqId, 1, 1);
-     readLoop(0, false, false, endPearson);
-     {advance to the next '>' header, if there is one}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos('>', s) <> 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readPearson}
-
- FUNCTION endEMBL(VAR s: String): boolean;
- {An EMBL record ends where the next line begins with 'ID '.}
-
- VAR
-   where : integer;
-
- BEGIN
-   where := pos('ID ', s);
-   endEMBL := where = 1;
- END;
-
- PROCEDURE readEMBL;
- {Reads EMBL records. On entry s holds the ID line; the id is that line
-  without its first 5 characters. Lines are skipped up to the one
-  beginning with 'SQ ', then readLoop takes the sequence lines. The
-  procedure is only ever left through exitseq.}
-
- BEGIN
-   REPEAT
-     seqId := concat(s, ' [EMBL 6]');
-     delete(seqId, 1, 5);
-     REPEAT
-       readln(f, s);
-     UNTIL eof(f) OR (pos('SQ ', s) = 1);
-     readLoop(0, false, false, endEMBL);
-     {advance to the next ID line, if there is one}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos('ID ', s) <> 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readEMBL}
-
- FUNCTION endZuker(VAR s: String): boolean;
- {A Zuker record ends where the next line begins with '('.}
-
- VAR
-   where : integer;
-
- BEGIN
-   where := pos('(', s);
-   endZuker := where = 1;
- END;
-
- PROCEDURE readZuker;
- {Reads Zuker records. On entry s holds the '(' format line; the line
-  after it supplies the id, minus its first 6 characters. readLoop
-  takes the sequence lines that follow. The procedure is only ever
-  left through exitseq.}
-
- BEGIN
-   REPEAT
-     {Zuker -- many seq/file ?}
-     {! 1st string is Zuker's Fortran format }
-     readln(f, s); {s == "seqLen seqId string..."}
-     seqId := concat(s, ' [NRC 7]');
-     delete(seqId, 1, 6);
-     readLoop(0, false, false, endZuker);
-     {advance to the next '(' format line, if there is one}
-     WHILE (NOT eof(f)) AND ((s = '') OR (pos('(', s) <> 1)) DO
-       readln(f, s);
-     IF eof(f) THEN exitseq(err);
-   UNTIL false;
- END; {readZuker }
-
- FUNCTION endFitch(VAR s: String): boolean;
- {A Fitch record ends where the next line does not start with a blank,
-  that line being the title of the following sequence.
-
-  An empty line has no first character; s[1] would then be whatever
-  byte an earlier, longer line left behind. Such a line is therefore
-  reported as not ending the record, and since it holds no symbols
-  addseq adds nothing for it.}
-
- BEGIN
-   IF length(s) = 0 THEN endFitch := false
-   ELSE endFitch := (s[1] <> ' ');
- END;
-
- PROCEDURE readFitch;
- {Reads Fitch records. For the first record the caller has already put
-  the title in seqId and the first sequence line in s, so readLoop is
-  asked to add s before reading further. For later records s holds the
-  title line that ended the previous record. The procedure is only
-  ever left through exitseq.}
-
- VAR
-   first : boolean;
-
- BEGIN
-   first := true;
-   WHILE true DO BEGIN
-     {Fitch -- many seqs/file }
-     IF NOT first THEN seqId := s;
-     seqId := concat(seqId, ' [Fitch 8]');
-     readLoop(0, false, first, endFitch);
-     IF eof(f) THEN exitseq(err);
-     first := false;
-   END;
- END; {readFitch }
-
- PROCEDURE readPreer;
- {Used locally at InU only. One sequence per file.
-  Entered with control = 99 for a listing, anything else for a fetch;
-  control is rewritten to rseqList or rseqFetch1 accordingly. Digits
-  are accepted as sequence symbols for this format. The first line
-  (a line count) and the last line (an index) are not part of the
-  sequence. Always leaves readSeq through exitseq.}
-
- BEGIN
-   {readPreer -- 1seq/file}
-   addit := control <> 99;
-   IF addit THEN control := rseqFetch1
-   ELSE control := rSeqList;
-   seqCharSet := seqCharSet + ['0'..'9']; {really, just digits}
-   seqId := concat(fname, ' [Preer format Z]');
-   readln(f, s); { skip 1st line = line count }
-   done := false;
-   WHILE NOT done DO BEGIN
-     readln(f, s);
-     done := eof(f);
-     IF NOT done THEN addseq(s); {skip last line=index}
-   END;
-
-   IF NOT addit THEN addinfo(seqId);
-   exitseq(err);
- END; {readPreer }
-
- PROCEDURE readUnknown;
- {Fallback for files of no recognised format: every line is treated as
-  sequence. The text already held in seqId is added first, then s,
-  then the rest of the file. Always leaves readSeq through exitseq.}
-
- BEGIN
-   {Unknown -- 1seq/file}
-
-   addit := control <> rseqList;
-   addseq( seqId); {from above..}
-   seqId := concat(fname, ' [Unknown format, assume all sequence 0]');
-   done := false;
-   WHILE NOT done DO BEGIN
-     addseq(s);
-     {eof is sampled before the read that follows it}
-     done := eof(f);
-     readln(f, s);
-   END;
-
-   CASE control OF
-     rseqFetch1: ;
-     rseqFetchN: ;
-     rseqList: addinfo(seqId);
-   END;
-   exitseq(err);
- END; {readUnknown }
-
- PROCEDURE readUWGCG;
- {Reads a UWGCG file. On entry s holds the '..' marker line, which
-  becomes the id. Every following line has its first 9 characters
-  removed and the remainder added as sequence. Always leaves readSeq
-  through exitseq.}
-
- BEGIN
-   { UWGCG -- 1seq/file}
-   seqId := concat(s, ' [UWGCG 9]');
-   addit := control <> rseqList;
-   done := false;
-   WHILE NOT done DO BEGIN
-     {eof is sampled before the read that follows it}
-     done := eof(f);
-     readln(f, s);
-     IF NOT done THEN BEGIN
-       delete(s, 1, 9); {skip margin}
-       addseq(s);
-       { if nseq=0 then sysbeep(1); !true }
-     END;
-   END;
-
-   CASE control OF
-     rseqFetch1: ;
-     rseqFetchN: ;
-     rseqList: addinfo(seqId);
-   END;
-   exitseq(err);
- END; {readUWGCG }
-
- BEGIN
-   {Main body of readSeq: set up the shared state, open the file, then
-    pick a format reader. Every reader leaves readSeq through exitseq.
-
-    Two reads outside a string's current length are guarded here:
-    the format digit is taken from mySeq only when mySeq has at least
-    2 characters, and the Fitch probe is skipped for an empty line.}
-
-   {On entry nseq is the buffer capacity and seqId the requested id;
-    both are saved before being reset for output.}
-   maxseq := nseq;
-   nseq := 0;
-   ninfo := 0;
-   mySeq := seqId; {for rseqFetchN}
-   seqId := '';
-   readSeq := 0;
-   addit := control <> rseqList;
-   seqCharSet := ['A'..'Z', 'a'..'z', '_', '@', '+', '-', '*', '.', '&'];
-
-   reset(f, fname);
-   err := ioresult;
-   IF err <> 0 THEN exitseq(err);
-
-   {InU only fix for Preer data: }
-   IF (control IN [98, 99]) THEN readPreer;
-
-   IF control = rseqFetchN THEN BEGIN
-     {don't need to re-check format}
-     REPEAT
-       readln(f, s);
-     UNTIL (s <> '') OR eof(f);
-     IF (s = '') THEN exitseq( - 1);
-     {The second-to-last character of the requested id is the format
-      digit from the bracketed tag. An id shorter than 2 characters
-      has no such position, so the dispatch is skipped and the format
-      detection below takes over.}
-     IF length(mySeq) >= 2 THEN
-       CASE mySeq[length(mySeq) - 1] OF
-         '0': readUnknown;
-         '1': readIG;
-         '2': readStrider;
-         '3': readGenBank;
-         '4': readNBRF;
-         '5': readPearson;
-         '6': readEMBL;
-         '7': readZuker;
-         '8': BEGIN
-                seqId := s;
-                readln(f, s);
-                readFitch;
-              END;
-         '9':
-           REPEAT
-             gotuw := pos('..', s) > 0;
-             IF gotuw THEN readUWGCG;
-             readln(f, s);
-           UNTIL eof(f);
-       END;
-   END;
-
-   { check for ".." of uwgcg, since it can masquerade as any
-     other format }
-   i := 0;
-   REPEAT
-     readln(f, s);
-     i := i + 1;
-     gotuw := pos('..', s) > 0;
-     IF gotuw THEN gotuw := pos('Check:', s) > 0;
-     done := gotuw OR eof(f) OR (i > 500);
-     {! ECOLAC UW/GenBank document header is 300 lines !}
-     IF (i < 5) AND (pos(';', s) = 1) THEN BEGIN
-       gotuw := false; done := true;
-       {fix for ToIG of UWGCG ... also NBRF/EMBL ?}
-     END
-   UNTIL done;
-
-   IF gotuw THEN readUWGCG
-   ELSE BEGIN
-     {not UWGCG: rewind and fetch the first non-blank line}
-     reset(f);
-     REPEAT
-       readln(f, s);
-     UNTIL (s <> '') OR eof(f);
-     IF (s = '') THEN exitseq( - 1);
-   END;
-
-   {Choose the reader from the way the first non-blank line begins.}
-   IF pos(';', s) = 1 THEN BEGIN
-     IF pos('Strider', s) > 0 THEN readStrider
-     ELSE readIG;
-   END
-
-   ELSE IF pos('LOCUS', s) = 1 THEN readGenBank
-
-   ELSE IF pos('>DL;', s) = 1 THEN readNBRF
-
-   ELSE IF pos('>', s) = 1 THEN readPearson
-
-   ELSE IF pos('ID ', s) = 1 THEN readEMBL
-
-   ELSE IF pos('(', s) = 1 THEN readZuker
-
-   ELSE BEGIN
-     seqId := s;
-     readln(f, s); {test for fitch format}
-     {The probe looks for a blank at columns 1, 5, 9, ... and a
-      non-blank everywhere else. An empty line has no columns to
-      examine, so it is not Fitch.}
-     IF length(s) = 0 THEN isfitch := false
-     ELSE BEGIN
-       i := 1;
-       REPEAT
-         isfitch := (((i - 1) MOD 4 = 0) AND (s[i] = ' ')) OR (((i - 1) MOD
-                    4 <> 0) AND (s[i] <> ' '));
-         i := i + 1;
-       UNTIL (i >= length(s)) OR NOT isfitch;
-     END;
-     IF isfitch THEN readFitch
-     ELSE readUnknown;
-   END;
-
- END; {readSeq}
-
- END.
-
- {
- sequence formats....
- ---------------------------------------------------
-
- stanford/IG
- ;comments
- ;...
- seq1 info
- abcd...
- efgh1 (or 2 = terminator)
- ;another seq
- ;....
- seq2 info
- abcd...1
- --- for example ----
- ; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 ..
- dro5stseq
- GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
- GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
-
- ; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120
- ---------------------------------------------------
-
- Genbank:
- LOCUS seq1 ID..
- ...
- ORIGIN ...
- 123456789abcdefg....(1st 9 columns are formatting)
- hijkl...
- // (end of sequence)
- LOCUS seq2 ID ..
- ...
- ORIGIN
- abcd...
- //
- ---------------------------------------------------
-
- PIR format
- > seq1 id
- ?? junk 2nd line
- abcdefg...
- hijkl...
- > seq2 ID
- ?? junk
- abcd....
- ---------------------------------------------------
-
- NBRF format: (from uwgcg ToNBRF) == PIR format
- >DL;DRO5SRNA
- Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
-
- 1