home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
ftp.barnyard.co.uk
/
2015.02.ftp.barnyard.co.uk.tar
/
ftp.barnyard.co.uk
/
cpm
/
walnut-creek-CDROM
/
CPM
/
TURBOPAS
/
MAPSTATF.LBR
/
CLUSTER.PZS
/
CLUSTER.ÐAS
Wrap
Text File
|
2000-06-30
|
8KB
|
278 lines
(* Multivariate Analysis Package - Kmeans Clustering Module
Copyright 1986 Douglas L. Anderton. This program may be freely
circulated so long as it is not sold for profit and any charge does
not exceed costs of reproduction *)
Program Cluster(Input,Output);
Const
N=10; { number of vars in clustering }
C=25; { max-min number of clusters to consider, e.g. between 50 and
20 (or 100 and 70, etc.) clusters should be examined }
NBYC=250;
WORK=3500;
Type
SUBS=1..N;
NCLS=1..C;
NSAV=1..NBYC;
RVEC = Array [SUBS] Of Real;
RVC2 = Array [NCLS] Of Real;
RVC3 = Array [NSAV] Of Real;
LVEC = Array [1..WORK] Of Real; { limited only by memory }
IVEC = Array [SUBS] Of Integer;
S8 = Array [SUBS] Of String[8];
Var
dfile, ofile, tfile : Text;
sel : IVEC;
varn : S8;
vars, miss : RVEC;
i, j, k, l, nc, n1, n2, nv : Integer;
dv, ot, f : Byte;
x : LVEC;
s : RVC3;
e : RVC2;
d : Real;
df : String[1];
tfl : String[12];
{$I TRANSBUF.LIB}
Procedure openfiles(Var dfile, ofile:Text; Var ot:Byte);
Var
dfl, ofl:String[12];
Begin
ClrScr; Writeln(' *** CLUSTER: KMEANS CLUSTERING ***');
Writeln; Write('Name of the data file? ');
Readln(dfl); Assign(dfile,dfl); Reset(dfile);
Write('Name of the output file? ');
Readln(ofl); Assign(ofile,ofl); Rewrite(ofile);
ot:= 0;
If (ofl='CON:') Or (ofl='con:') Then ot:=1;
If (ofl='LST:') Or (ofl='lst:') Then ot:=2;
If (ot=2) Then
Begin
Writeln(ofile,'Multivariate Analysis Package (1.6) - ',
'Program: CLUSTER, Datafile: ',dfl); Writeln(ofile);
End;
End; (* Of openfiles *)
Procedure wait(ot:Byte);
Begin
If ot=1 Then Begin
Write('- Press any key to continue -'); While Not KeyPressed Do; ClrScr;
End;
End; (* of wait *)
Procedure selectvar(Var sel:IVEC; Var varn:S8; Var miss:RVEC;
Var nv:Integer; Var dv:Byte);
Var
cfile:Text;
cfl:String[12];
i,j,f:Integer;
mis:Real;
van:String[8];
Begin
Write('Name of the codebook file (or NONE)? '); Readln(cfl);
If (cfl<>'NONE') And (cfl<>'none') Then f:=1 Else f:=0;
If f=1 Then Begin Assign(cfile,cfl); Reset(cfile); End;
Writeln;
Write('How many variables in data file? '); Readln(nv);
Write('Number of variables to use in CLUSTER? '); Readln(dv);
Writeln('If multiple variables specified all should be comparable scale.');
For i := 1 To dv Do
Begin
Write('Column number for variable ',i,'? '); Readln(sel[i]);
Str(sel[i]:3,varn[i]); miss[i]:=-1E37;
End;
If f=1 Then Begin
For j:=1 to nv Do Begin
mis:=-1E37;
Readln(cfile,f,van,mis);
For i:=1 to dv Do
If f=sel[i] Then Begin
varn[i]:=van; miss[i]:=mis;
Writeln('Col: ',sel[i],' Name: ',varn[i],' Missing: ',miss[i]:6);
End;
End;
Close(cfile);
wait(1);
End;
End; (* Of selectvar *)
Procedure getcase(Var vars:RVEC; sel:IVEC; nv, dv:Integer; Var dfile:Text);
Var
i, j:Integer;
x:Real;
Begin
For i:=1 To nv Do
Begin
Read(dfile,x);
For j:=1 To dv Do If (sel[j]=i) Then vars[j]:=x;
End;
End; (* Of getcase *)
Procedure kmeans(Var x:LVEC; Var s:RVC3; Var e:RVC2; Var l, nc:Integer;
Var dv:Byte; Var d:Real);
Var
q: Array [NCLS] Of Integer;
i, it, j, k, r, v, w: Integer;
a, b, f, h, g, t: Real;
Begin
For i:=1 To l Do
Begin
q[i]:=0; e[i]:=0.0;
For j:=1 To dv Do s[(i-1)*dv+j]:=0.0;
End;
For i:=1 To nc Do
Begin
k:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1); q[k]:=q[k]+1;
For j:=1 To dv Do s[(k-1)*dv+j]:=s[(k-1)*dv+j]+x[(i-1)*(dv+1)+j];
End;
For i:=1 To l Do
For j:=1 To dv Do s[(i-1)*dv+j]:=s[(i-1)*dv+j]/q[i];
d:=0.;
For i:=1 To nc Do
Begin
f:=0; k:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1);
For j:=1 To dv Do f:=f+Sqr(s[(k-1)*dv+j]-x[(i-1)*(dv+1)+j]);
e[k]:=e[k]+f;
d:=d+f;
End;
i:=0; it:=0;
While it<nc Do
Begin
i:=i+1; If i>nc Then i:=i-nc;
r:=Trunc(x[(i-1)*(dv+1)+dv+1]+0.1);
If q[r]>1 Then
Begin
h:=q[r]; h:=h/(h-1.0); f:=0;
For j:=1 To dv Do f:=f+Sqr(s[(r-1)*dv+j]-x[(i-1)*(dv+1)+j]);
a:=h*f; b:=1E30;
For j:=1 To l Do
Begin
If j<>r Then
Begin
h:=q[j]; h:=h/(h+1.0); f:=0;
For k:=1 To dv Do f:=f+Sqr(s[(j-1)*dv+k]-x[(i-1)*(dv+1)+k]);
f:=h*f;
If f<=b Then
Begin
b:=f; v:=j; w:=q[j];
End;
End;
End;
If b>=a Then
Begin
it:=it+1;
Write(' ',it);
End
Else
Begin
it:=0; e[r]:=e[r]-a; e[v]:=e[v]+b; d:=d-a+b;
h:=q[r]; g:=w; a:=1./(h-1.0); b:=1./(g+1.0);
For k:=1 To dv Do
Begin
s[(r-1)*dv+k]:=a*(h*s[(r-1)*dv+k]-x[(i-1)*(dv+1)+k]);
s[(v-1)*dv+k]:=b*(g*s[(v-1)*dv+k]+x[(i-1)*(dv+1)+k]);
End;
x[(i-1)*(dv+1)+dv+1]:=v; q[r]:=q[r]-1; q[v]:=q[v]+1;
End;
End;
End;
End; (* Of kmeans *)
Begin (* main *)
openfiles(dfile, ofile, ot);
selectvar(sel, varn, miss, nv, dv);
(* initialize *)
ClrScr; Writeln; n1:=0; n2:=C+1;
While (n2-n1)>C Do Begin
Write('Minimum number of Clusters to investigate? ');Read(n1);
Write(' Maximum number? ');Readln(n2);
If(n2-n1)>C Then Writeln('Too many, specify smaller range.');
End;
(* read in matrix *)
Writeln;
nc:=0;
While Not EOF(dfile) Do
Begin
getcase(vars, sel, nv, dv, dfile);
If Frac(nc/10)=0.0 Then Write('+');
If Not EOF(dfile) Then
Begin
f:=0;
For j:=1 To dv Do If vars[j]=miss[j] Then f:=1;
If f=0 Then
Begin
nc:=nc+1;
For j:=1 To dv Do
Begin
l:=(nc-1)*(dv+1)+j;
If l>WORK Then
Begin
Writeln('*** STORAGE EXCEEDED: Try Fewer (MAX-MIN) Clusters ***');
bdos(0);
End;
x[l]:=vars[j];
End;
End;
End;
End;
Reset(dfile);
(* compute And Output *)
For l:=n1 To n2 Do
Begin
k:=0;
For j:=1 To nc Do x[(j-1)*(dv+1)+dv+1]:=j-(j div l)*l+1;
ClrScr;
Writeln('Iterating on ',l:2,' clusters.');
kmeans(x,s,e,l,nc,dv,d);
ClrScr;
Writeln(ofile);
Writeln(ofile,'Total SSQ Distances Within ',l:2,' Clusters: ',d:10);
If ot<>1 Then
Writeln('Total SSQ Distances Within ',l:2,' Clusters: ',d:10);
wait(ot);
Writeln(ofile);
For i:=1 To l Do
Begin
Writeln(ofile,'Cluster: ',i:2,' Within SSQ: ',e[i]:10);
Writeln(ofile,' Centroids:');
For j:=1 To dv Do
Begin
Write(ofile,' ',sel[j]:2,': ',s[(i-1)*dv+j]:10);
If (j=5) And (dv>5) Then Writeln(ofile);
End;
Writeln(ofile);
If ((Frac(i/5)=0.0) Or (i=l)) Then wait(ot);
End;
Writeln('Would you like data and cluster membership for this level');
Write('written to a merged file (y/n)? '); Readln(df);
If (df='y') Or (df='Y') Then
Begin
Writeln;
Write('Name of new output file? '); Readln(tfl);
initwrite(tfl);
k:=0;
While Not EOF(dfile) Do
Begin
f:=0; For i:=1 To nv Do Read(dfile,s[i]);
If Frac(k/10)=0.0 Then Write('+');
If Not EOF(dfile) Then
Begin
For i:=1 To nv Do
For j:=1 To dv Do If (sel[j]=i) And (s[i]=miss[j]) Then f:=1;
s[nv+1]:=-1E37;
If f=0 Then
Begin k:=k+1; s[nv+1]:=x[(k-1)*(dv+1)+dv+1]; End;
For i:=1 To nv+1 Do Write(Usr,s[i]:9,' ');
Writeln(Usr);
End;
End;
Close(dfile); endwrite; Assign(dfile,tfl); Reset(dfile); nv:=nv+1;
End;
ClrScr;
End;
Close(dfile); Close(ofile);
Assign(dfile,'MAPSTAT.COM'); Execute(dfile);
End.
wait(ot)