home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The World of Computer Software
/
World_Of_Computer_Software-02-385-Vol-1of3.iso
/
r
/
rcc1.zip
/
rcc1
/
rcc1.lisp
< prev
Wrap
Lisp/Scheme
|
1991-10-10
|
62KB
|
1,598 lines
;;; -*- Mode:Lisp -*-
;;; ***************************************************************************
;;; Recurrent Cascade-Correlation simulator, Common Lisp version.
;;;
;;; Written by: Scott E. Fahlman
;;; School of Computer Science
;;; Carnegie-Mellon University
;;; Pittsburgh, PA 15217
;;;
;;; Phone: (412) 268-2575
;;; Internet: fahlman@cs.cmu.edu
;;;
;;; This code has been placed in the public domain by the author. As a
;;; matter of simple courtesy, anyone using or adapting this code is
;;; expected to acknowledge the source. The author would like to hear
;;; about any attempts to use this system, successful or not.
;;;
;;; For an explanation of this algorithm and some results, see "The
;;; Recurrent Cascade-Correlation Learning Architecture" by Scott E.
;;; Fahlman in D. S. Touretzky (ed.), "Advances in Neural Information
;;; Processing Systems 3", Morgan Kaufmann, 1991. A somewhat longer
;;; version is available as CMU Computer Science Tech Report CMU-CS-91-100.
;;; ***************************************************************************
;;; This proclamation buys a certain amount of overall speed at the expense
;;; of runtime checking. Comment it out when debugging new, bug-infested code.
(proclaim '(optimize (speed 3) (space 0) (safety 0))) ; Global: favor speed, drop runtime safety checks.
;;; Style note: Because some of these runs take a long time, this code is
;;; extensively hacked for good performance under CMU Common Lisp.
;;; Elegance and clarity have in some cases been sacrificed for speed.
;;; The EXTENSIONS:*IGNORE-FLOATING-POINT-UNDERFLOW* switch, if non-null,
;;; says that floating point underflows should quietly return zero rather
;;; than signalling an error. Underflows occur only in a few tasks, so it
;;; may be possible to get along without a switch of this sort.
;;; (setq extensions:*ignore-floating-point-underflow* t)
;;; Compensate for the clumsy Common Lisp declaration system and weak
;;; type-inference in some primitive Common Lisp compilers. INCF-SF, *SF,
;;; etc. are like INCF, *, etc., but they declare their operands and
;;; results to be short-floats. The code gets unreadable quickly if you
;;; insert all these declarations by hand.
(defmacro incf-sf (place &optional (increment 1.0))
  "Like INCF, but declares PLACE and INCREMENT to be SHORT-FLOAT."
  `(the short-float (incf (the short-float ,place)
                          (the short-float ,increment))))
(defmacro decf-sf (place &optional (increment 1.0))
  "Like DECF, but declares PLACE and INCREMENT to be SHORT-FLOAT."
  `(the short-float (decf (the short-float ,place)
                          (the short-float ,increment))))
(defmacro *sf (&rest args)
  "Multiply ARGS, declaring every operand and the result SHORT-FLOAT."
  `(the short-float
        (* ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))
(defmacro +sf (&rest args)
  "Add ARGS, declaring every operand and the result SHORT-FLOAT."
  `(the short-float
        (+ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))
(defmacro -sf (&rest args)
  "Subtract (or negate, with one arg), declaring every operand and the
result SHORT-FLOAT."
  `(the short-float
        (- ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))
(defmacro /sf (&rest args)
  "Divide ARGS, declaring every operand and the result SHORT-FLOAT."
  `(the short-float
        (/ ,@(mapcar #'(lambda (x) (list 'the 'short-float x)) args))))
;;; DOTIMES1 is like DOTIMES, only with the loop counter declared as a
;;; fixnum. This is for compilers with weak type inference.
(defmacro dotimes1 (form1 &body body)
  "Like DOTIMES, but declares the loop counter to be a FIXNUM."
  `(dotimes ,form1 (declare (fixnum ,(car form1))) . ,body))
;;; Create vector-access forms similar to SVREF, but for vectors of
;;; element-type SHORT-FLOAT and FIXNUM.
(eval-when (compile load eval)
  ;; Record the specialized element types this implementation actually
  ;; uses for SHORT-FLOAT and FIXNUM arrays (T if it does not specialize).
  (defconstant fvector-type
    (array-element-type (make-array '(1) :element-type 'short-float)))
  (defconstant ivector-type
    (array-element-type (make-array '(1) :element-type 'fixnum))))
(defmacro fvref (a i)
  "Like SVREF, but with vectors of element-type SHORT-FLOAT."
  ;; If short-float arrays are upgraded to element-type T, fall back to
  ;; plain SVREF; otherwise declare the specialized array type for speed.
  (if (eq fvector-type t)
      `(the short-float (svref ,a ,i))
      `(the short-float (aref (the (simple-array ,fvector-type (*)) ,a) ,i))))
(defmacro ivref (a i)
  "Like SVREF, but with vectors of element-type FIXNUM."
  (if (eq ivector-type t)
      `(the fixnum (svref ,a ,i))
      `(the fixnum (aref (the (simple-array ,ivector-type (*)) ,a) ,i))))
;;;; Assorted Parameters and Controls.
;;; These parameters and switches control the quickprop learning algorithm
;;; used to train the output weights and candidate units.
(defvar *unit-type* :sigmoid
"The type of activation function used by the hidden units. Options
currently implemented are :sigmoid, :asigmoid, and :gaussian. Sigmoid is
symmetric in range -0.5 to +0.5, while Asigmoid is asymmetric, 0.0 to
1.0.")
(defvar *output-type* :sigmoid
"The activation function to use on the output units. Options currently
implemented are :linear and :sigmoid.")
(defvar *sigmoid-prime-offset* 0.1
"This is added to the derivative of the sigmoid function to prevent the
system from getting stuck at the points where sigmoid-prime goes to
zero.")
(proclaim '(short-float *sigmoid-prime-offset*))
(defvar *weight-range* 1.0
  "Input weights in the network get initial random values between plus and
minus *weight-range*. This parameter also controls the initial weights
on direct input-to-output links.")
(proclaim '(short-float *weight-range*))
(defvar *weight-multiplier* 1.0
  "The output weights for candidate units get an initial value that is the
negative of the correlation times this factor.")
(proclaim '(short-float *weight-multiplier*))
(defvar *output-mu* 2.0
  "Mu parameter used for quickprop training of output weights. The
step size is limited to mu times the previous step.")
(proclaim '(short-float *output-mu*))
(defvar *output-shrink-factor* (/ *output-mu* (+ 1.0 *output-mu*))
"Derived from *output-mu*. Used in computing whether the proposed step is
too large.")
(proclaim '(short-float *output-shrink-factor*))
(defvar *output-epsilon* 0.35
"Controls the amount of linear gradient descent to use in updating
output weights.")
(proclaim '(short-float *output-epsilon*))
(defvar *output-decay* 0.0001
"This factor times the current weight is added to the slope at the
start of each output-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *output-decay*))
(defvar *output-patience* 12
"If we go for this many epochs with no significant change, it's time to
stop tuning. If 0, go on forever.")
(proclaim '(fixnum *output-patience*))
(defvar *output-change-threshold* 0.01
"The error must change by at least this fraction of its old value in
order to count as a significant change.")
(proclaim '(short-float *output-change-threshold*))
(defvar *input-mu* 2.0
  "Mu parameter used for quickprop training of input weights. The
step size is limited to mu times the previous step.")
(proclaim '(short-float *input-mu*))
(defvar *input-shrink-factor* (/ *input-mu* (+ 1.0 *input-mu*))
"Derived from *input-mu*. Used in computing whether the proposed step is
too large.")
(proclaim '(short-float *input-shrink-factor*))
(defvar *input-epsilon* 1.0
"Controls the amount of linear gradient descent to use in updating
unit input weights.")
(proclaim '(short-float *input-epsilon*))
(defvar *input-decay* 0.0
  "This factor times the current weight is added to the slope at the
start of each input-training epoch. Keeps weights from growing too big.")
(proclaim '(short-float *input-decay*))
(defvar *input-patience* 8
"If we go for this many epochs with no significant change, it's time to
stop tuning. If 0, go on forever.")
(proclaim '(fixnum *input-patience*))
(defvar *input-change-threshold* 0.03
"The correlation score for the best unit must change by at least
this fraction of its old value in order to count as a significant
change.")
(proclaim '(short-float *input-change-threshold*))
;;; Variables related to error and correlation.
(defvar *score-threshold* 0.4
"An output is counted as correct for a given case if the difference
between that output and the desired value is smaller in magnitude than
this value.")
(proclaim '(short-float *score-threshold*))
(defvar *error-bits* 0
"Count number of bits in epoch that are wrong by more than
*SCORE-THRESHOLD*")
(proclaim '(fixnum *error-bits*))
(defvar *true-error* 0.0
"The sum-squared error at the network outputs. This is the value the
algorithm is ultimately trying to minimize.")
(proclaim '(short-float *true-error*))
(defvar *sum-sq-error* 0.0
"Accumulate the sum of the squared error values after output
training phase.")
(proclaim '(short-float *sum-sq-error*))
(defvar *best-candidate-score* 0.0
"The best correlation score found among all candidate units being
trained.")
(proclaim '(short-float *best-candidate-score*))
(defvar *best-candidate* 0
"The index of the candidate unit whose correlation score is best
at present.")
(proclaim '(fixnum *best-candidate*))
;;; These variables and switches control the simulation.
(defvar *use-cache* t
"If T, cache the forward-pass values instead of repeatedly
computing them. This can save a *lot* of time if all the cached values
fit into memory.")
(defparameter *epoch* 0
"Count of the number of times the entire training set has been presented.")
(proclaim '(fixnum *epoch*))
(defvar *done* nil
"Set to T whenever some problem-specific test function wants to abort
processing.")
(defvar *test* t
"If T, run a test epoch every so often during output training.")
(defvar *test-interval* 0
"Run a test epoch every *test-interval* output training cycles.")
(proclaim '(fixnum *test-interval*))
(defvar *single-pass* nil
"When on, pause after next forward/backward cycle.")
(defvar *single-epoch* nil
"When on, pause after next training epoch.")
(defparameter *step* nil
"Turned briefly to T in order to continue after a pause.")
;;; The sets of training inputs and outputs are stored in parallel vectors.
;;; Each element is a SIMPLE-VECTOR holding short-float values, one for
;;; each input or output. Note: this is a simple vector, not a specialized
;;; vector of element-type short-float.
(defvar *training-inputs* (make-array 0)
"Vector of input patterns for training the net.")
(proclaim '(simple-vector *training-inputs*))
(defvar *training-outputs* (make-array 0)
"Vector of output patterns for training the net.")
(proclaim '(simple-vector *training-outputs*))
(defvar *use-training-breaks* nil
"If true, use the training breaks vector. Else, never break.")
(defvar *training-breaks* (make-array 0)
"If *use-training-breaks* is true, this must be a simple vector with
one entry, T or NIL, for each training case. If T, zero all accumulated
state before processing this case.")
(proclaim '(type simple-vector *training-breaks*))
;;; Test inputs and outputs are just the same in form as training inputs
;;; and outputs. If there is to be no distinct test set, just set
;;; *test-inputs* EQ to *training-inputs*.
(defvar *test-inputs* *training-inputs*
"Vector of input patterns for testing the net.")
(proclaim '(simple-vector *test-inputs*))
(defvar *test-outputs* *training-outputs*
"Vector of output patterns for testing the net.")
(proclaim '(simple-vector *test-outputs*))
(defvar *use-test-breaks* nil
"If true, use the test breaks vector. Else, never break.")
(defvar *test-breaks* *training-breaks*
"If *use-test-breaks* is true, this must be a simple vector with
one entry, T or NIL, for each test case. If T, zero all accumulated
state before processing this case.")
(proclaim '(type simple-vector *test-breaks*))
;;; Some other data structures and parameters that control training.
(defvar *max-cases* 0
  "Maximum number of training cases that can be accommodated by the current
data structures.")
(proclaim '(fixnum *max-cases*))
(defvar *ncases* 0
"Number of training cases currently in use. Assume a contiguous block
beginning with *FIRST-CASE*.")
(proclaim '(fixnum *ncases*))
(defvar *first-case* 0
"Address of the first training case in the currently active set. Usually
zero, but may differ if we are training on different chunks of the training
set at different times.")
(proclaim '(fixnum *first-case*))
;;;; Fundamental data structures.
;;; Unit values and weights are short flonums.
;;; Instead of representing each unit by a structure, we represent the
;;; unit by a fixnum. This is used to index into various vectors that hold
;;; per-unit information, such as the activation value of each unit.
;;; So the information concerning a given unit is found in a slice of values
;;; across many vectors, all with the same unit-index.
;;; Per-connection information for each connection COMING INTO unit is
;;; stored in a vector of vectors. The outer vector is indexed by the unit
;;; number, and the inner vector is then indexed by connection number.
;;; This is a sleazy way of implementing a 2-D array, faster in most Lisp
;;; systems than multiplying to do the index arithmetic.
;;; In this version, we assume that each unit gets input from all previous
;;; units, in order. Unit 0, the "bias unit" is always at a maximum-on
;;; value. Next come some input "units", then some hidden units. The
;;; final incoming weight is the recurrent self-connection.
;;; Output units have their own separate set of data structures and
;;; indices. The units and outputs together form the "active" network.
;;; There are also separate data structures and indices for the "candidate"
;;; units that have not yet been added to the network.
(defvar *max-units* 100
"Maximum number of input values and hidden units in the network.")
(proclaim '(fixnum *max-units*))
(defvar *ninputs* 0
"Number of inputs for this problem.")
(proclaim '(fixnum *ninputs*))
(defvar *noutputs* 0
"Number of outputs for this problem.")
(proclaim '(fixnum *noutputs*))
(defvar *nunits* 0
"Current number of active units in the network. This count includes all
external inputs to the network and the bias unit.")
(proclaim '(fixnum *nunits*))
(defvar *ncandidates* 8
"Number of candidate units whose inputs will be trained at once.")
(proclaim '(fixnum *ncandidates*))
;;; The following vectors hold values related to hidden units in the active
;;; net and their input weights. The vectors are created by BUILD-NET, after
;;; the dimension variables have been set up.
(defvar *values* (make-array 0 :element-type 'short-float)
"Vector holding the current activation value for each unit and input in
the active net.")
(proclaim '(type (vector short-float) *values*))
(defvar *values-cache* (make-array 0)
"Holds a distinct *VALUES* vector for each of the *MAX-CASES* training
cases. Once we have computed the *VALUES* vector for each training case,
we can use it repeatedly until the weights or training cases change.")
(proclaim '(simple-vector *values-cache*))
(defvar *extra-values* (make-array 0 :element-type 'short-float)
"Extra values vector to use when not using the cache.")
(proclaim '(type (vector short-float) *extra-values*))
(defvar *weights* (make-array 0)
"Vector of vectors. Each entry gives the weight associated with an
incoming connection.")
(proclaim '(simple-vector *weights*))
;;; The following vectors hold values for the outputs of the active
;;; network and the output-side weights.
(defvar *outputs* (make-array 0 :element-type 'short-float)
"Vector holding the network output values.")
(proclaim '(type (vector short-float) *outputs*))
(defvar *errors* (make-array 0 :element-type 'short-float)
"Vector holding the current error value for each output.")
(proclaim '(type (vector short-float) *errors*))
(defvar *errors-cache* (make-array 0)
"Holds a distinct *ERRORS* vector for each of the *MAX-CASES* training
cases. Once we have computed the *ERRORS* vector for a given training
case, we can use it repeatedly until the weights of the training cases
change.")
(proclaim '(simple-vector *errors-cache*))
(defvar *extra-errors* (make-array 0 :element-type 'short-float)
"Extra errors vector to use when not using the cache.")
(proclaim '(type (vector short-float) *extra-errors*))
(defvar *sum-errors* (make-array 0 :element-type 'short-float)
"Accumulate for each output the sum of the error values over a whole
output training epoch.")
(proclaim '(type (vector short-float) *sum-errors*))
(defvar *dummy-sum-errors* (make-array 0 :element-type 'short-float)
"Replace sum-errors with this during test epochs.")
(proclaim '(type (vector short-float) *dummy-sum-errors*))
(defvar *output-weights* (make-array 0)
"Vector of vectors. For each output, we have a vector of output weights
coming from the unit indicated by the index.")
(proclaim '(simple-vector *output-weights*))
(defvar *output-deltas* (make-array 0)
"Vector of vectors, parallel with output weights. Each entry is the
amount by which the corresponding output weight was changed last time.")
(proclaim '(simple-vector *output-deltas*))
(defvar *output-slopes* (make-array 0)
  "Vector of vectors, parallel with output weights. Each entry is the
partial derivative of the total error with respect to the corresponding
weight.")
(proclaim '(simple-vector *output-slopes*))
(defvar *output-prev-slopes* (make-array 0)
"Vector of vectors, parallel with output weights. Each entry is the
previous value of the corresponding *OUTPUT-SLOPES* entry.")
(proclaim '(simple-vector *output-prev-slopes*))
(defvar *output-weights-record* (make-array 0)
"The vector of output weights is recorded here after each output-training
phase and just prior to the addition of the next unit. This record
allows us to reconstruct the network's performance at each of these
points in time.")
(proclaim '(simple-vector *output-weights-record*))
;;; The following vectors have one entry for each candidate unit in the
;;; pool of trainees.
(defvar *cand-scores* (make-array 0 :element-type 'short-float)
"A vector holding the correlation score for each candidate unit.")
(proclaim '(type (vector short-float) *cand-scores*))
(defvar *cand-values* (make-array 0 :element-type 'short-float)
"A vector holding the most recent output value for each candidate unit.")
(proclaim '(type (vector short-float) *cand-values*))
(defvar *cand-sum-values* (make-array 0 :element-type 'short-float)
"For each candidate unit, the sum of its values over an entire
training set.")
(proclaim '(type (vector short-float) *cand-sum-values*))
(defvar *cand-cor* (make-array 0)
"A vector with one entry for each candidate unit. This entry is a vector
that holds the correlation between this unit's value and the residual
error at each of the outputs, computed over a whole epoch.")
(proclaim '(simple-vector *cand-cor*))
(defvar *cand-prev-cor* (make-array 0)
  "Holds the *cand-cor* values computed in the previous candidate training
epoch.")
;; Bug fix: this proclamation previously named *CAND-COR*, which left
;; *CAND-PREV-COR* without a type declaration (and redundantly
;; re-declared *CAND-COR*, already proclaimed above).
(proclaim '(simple-vector *cand-prev-cor*))
(defvar *cand-weights* (make-array 0)
"A vector with one entry for each candidate unit. This entry is a vector
that holds the current input weights for that candidate unit.")
(proclaim '(simple-vector *cand-weights*))
(defvar *cand-deltas* (make-array 0)
  "A vector with one entry for each candidate unit. This entry is a vector
that holds the input weights deltas for that candidate unit.")
;; Bug fix: this proclamation previously named *CAND-WEIGHTS*, which left
;; *CAND-DELTAS* without a type declaration (and redundantly re-declared
;; *CAND-WEIGHTS*, already proclaimed above).
(proclaim '(simple-vector *cand-deltas*))
(defvar *cand-slopes* (make-array 0)
"A vector with one entry for each candidate unit. This entry is a vector
that holds the input weights slopes for that candidate unit.")
(proclaim '(simple-vector *cand-slopes*))
(defvar *cand-prev-slopes* (make-array 0)
"A vector with one entry for each candidate unit. This entry is a vector
that holds the previous values of the input weight slopes for that
candidate unit.")
(proclaim '(simple-vector *cand-prev-slopes*))
(defvar *cand-derivs* (make-array 0)
"A vector of vectors, parallel in structure to the *cand-weights* vector.
For each weight wi, remember dV/dwi on the previous training case.")
(proclaim '(simple-vector *cand-derivs*))
;;;; Network-building utilities.
(defun build-net (ninputs noutputs)
  "Create the network data structures, given the number of input and output
connections. Get *MAX-UNITS* and other dimensions from variables."
  (declare (fixnum ninputs noutputs))
  ;; Check to make sure *MAX-UNITS* is big enough.
  (unless (> *max-units* (+ ninputs 1))
    (error "*MAX-UNITS* must be greater than number of inputs plus 1."))
  ;; Fill in assorted variables and create top-level vectors.
  (setq *ninputs* ninputs
        *noutputs* noutputs
        *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        ;; Initially active units: the bias unit plus the external inputs.
        ;; Hidden units are added later, one at a time.
        *nunits* (+ 1 *ninputs*)
        *values-cache* (make-array *max-cases* :initial-element nil)
        *extra-values* (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *values* *extra-values*
        *weights* (make-array *max-units* :initial-element nil)
        *outputs* (make-array *noutputs*
                              :element-type 'short-float
                              :initial-element 0.0)
        *errors-cache* (make-array *max-cases* :initial-element nil)
        *extra-errors* (make-array *noutputs*
                                   :element-type 'short-float
                                   :initial-element 0.0)
        *errors* *extra-errors*
        *sum-errors* (make-array *noutputs*
                                 :element-type 'short-float
                                 :initial-element 0.0)
        *dummy-sum-errors* (make-array *noutputs*
                                       :element-type 'short-float
                                       :initial-element 0.0)
        *output-weights* (make-array *noutputs* :initial-element nil)
        *output-weights-record* (make-array *max-units* :initial-element nil)
        *output-deltas* (make-array *noutputs* :initial-element nil)
        *output-slopes* (make-array *noutputs* :initial-element nil)
        *output-prev-slopes* (make-array *noutputs* :initial-element nil)
        *cand-values* (make-array *ncandidates*
                                  :element-type 'short-float
                                  :initial-element 0.0)
        *cand-sum-values* (make-array *ncandidates*
                                      :element-type 'short-float
                                      :initial-element 0.0)
        *cand-scores* (make-array *ncandidates*
                                  :element-type 'short-float
                                  :initial-element 0.0)
        *cand-cor* (make-array *ncandidates* :initial-element nil)
        *cand-prev-cor* (make-array *ncandidates* :initial-element nil)
        *cand-weights* (make-array *ncandidates* :initial-element nil)
        *cand-deltas* (make-array *ncandidates* :initial-element nil)
        *cand-slopes* (make-array *ncandidates* :initial-element nil)
        *cand-prev-slopes* (make-array *ncandidates* :initial-element nil)
        *cand-derivs* (make-array *ncandidates* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setf (svref *values-cache* i)
            (make-array *max-units*
                        :element-type 'short-float
                        :initial-element 0.0))
      (setf (svref *errors-cache* i)
            (make-array *noutputs*
                        :element-type 'short-float
                        :initial-element 0.0))))
  ;; For each output, create the vectors holding per-weight information.
  (dotimes1 (i *noutputs*)
    (setf (svref *output-weights* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-deltas* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *output-prev-slopes* i)
          (make-array *max-units*
                      :element-type 'short-float
                      :initial-element 0.0)))
  ;; For each candidate unit, create the vectors holding the correlations,
  ;; incoming weights, and other stats.
  ;; Note: the per-candidate weight vectors are one longer than *MAX-UNITS*
  ;; because the final slot holds the recurrent self-connection weight.
  (dotimes1 (i *ncandidates*)
    (setf (svref *cand-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-cor* i)
          (make-array *noutputs*
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-weights* i)
          (make-array (+ *max-units* 1)
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-deltas* i)
          (make-array (+ *max-units* 1)
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-slopes* i)
          (make-array (+ *max-units* 1)
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-prev-slopes* i)
          (make-array (+ *max-units* 1)
                      :element-type 'short-float
                      :initial-element 0.0))
    (setf (svref *cand-derivs* i)
          (make-array (+ *max-units* 1)
                      :element-type 'short-float
                      :initial-element 0.0))))
(defun random-weight ()
  "Return a random weight drawn uniformly from the open interval
between minus and plus *weight-range*."
  (let ((range *weight-range*))
    (-sf (random (*sf 2.0 range)) range)))
(defun init-net ()
  "Set up the network for a learning problem. Clean up all the data
structures that may have become corrupted. Initialize the output weights
to random values controlled by *weight-range*."
  ;; Initialize the active unit data structures.
  (dotimes1 (i *max-units*)
    (setf (fvref *extra-values* i) 0.0)
    (setf (svref *weights* i) nil)
    (setf (svref *output-weights-record* i) nil))
  (setq *nunits* (+ 1 *ninputs*))
  ;; Initialize the per-output data structures.
  (dotimes1 (i *noutputs*)
    (setf (fvref *outputs* i) 0.0)
    (setf (fvref *extra-errors* i) 0.0)
    (setf (fvref *sum-errors* i) 0.0)
    (setf (fvref *dummy-sum-errors* i) 0.0)
    (let ((ow (svref *output-weights* i))
          (od (svref *output-deltas* i))
          (os (svref *output-slopes* i))
          (op (svref *output-prev-slopes* i)))
      ;; Zero all per-weight state first...
      (dotimes1 (j *max-units*)
        (setf (fvref ow j) 0.0)
        (setf (fvref od j) 0.0)
        (setf (fvref os j) 0.0)
        (setf (fvref op j) 0.0))
      ;; Set up initial random weights for the input-to-output connections.
      ;; (Slots 0 .. *ninputs* cover the bias unit and the raw inputs.)
      (dotimes1 (j (1+ *ninputs*))
        (setf (fvref ow j) (random-weight)))))
  ;; Initialize the caches if they are in use.
  (when *use-cache*
    (dotimes1 (j *max-cases*)
      (let ((v (svref *values-cache* j))
            (e (svref *errors-cache* j)))
        (dotimes1 (i *max-units*)
          (setf (fvref v i) 0.0))
        (dotimes1 (i *noutputs*)
          (setf (fvref e i) 0.0)))))
  ;; Candidate units get initialized in a separate routine.
  (init-candidates)
  ;; Do some other assorted housekeeping.
  ;; The bias unit (index 0) always holds a constant 1.0.
  (setf (fvref *extra-values* 0) 1.0)
  (setq *epoch* 0)
  (setq *nunits* (+ 1 *ninputs*))
  (setq *error-bits* 0)
  (setq *true-error* 0.0)
  (setq *sum-sq-error* 0.0)
  (setq *best-candidate-score* 0.0)
  (setq *best-candidate* 0))
(defun changed-training-set ()
  "Call this instead of BUILD-NET and INIT-NET if you want to leave
existing hidden units in place and start from here with new training
examples. Assumes that the number of net inputs and outputs remains the
same, but the number of cases may have changed. Rebuilds the caches."
  (setq *max-cases* (length *training-inputs*)
        *ncases* *max-cases*
        *first-case* 0
        *values-cache* (make-array *max-cases* :initial-element nil)
        *errors-cache* (make-array *max-cases* :initial-element nil))
  ;; Only create the caches if *USE-CACHE* is on -- may not always have room.
  (when *use-cache*
    ;; PREV-VALUES threads each hidden unit's activation forward from one
    ;; training case to the next, feeding the recurrent self-connections.
    ;; It starts out all zero for the first case.
    (let ((prev-values (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0)))
      (dotimes1 (i *max-cases*)
        (setf (svref *errors-cache* i)
              (make-array *noutputs*
                          :element-type 'short-float
                          :initial-element 0.0))
        ;; Build a fresh *VALUES* vector for this case and cache it.
        (setq *values* (make-array *max-units*
                                   :element-type 'short-float
                                   :initial-element 0.0))
        (setf (svref *values-cache* i) *values*)
        (set-up-inputs (svref *training-inputs* i))
        ;; Run the forward pass through the existing hidden units.
        (do ((j (1+ *ninputs*) (1+ j)))
            ((= j *nunits*))
          (declare (fixnum j))
          (compute-unit-value
           j
           ;; At a training break, reset the recurrent state to zero.
           (if (and *use-training-breaks*
                    (svref *training-breaks* i))
               0.0
               (fvref prev-values j))))
        (setq prev-values *values*)))))
;;;; Utilities for learning.
(proclaim '(inline activation activation-prime))
(defun activation (sum)
  (declare (short-float sum))
  "Given the sum of weighted inputs, compute the unit's activation value.
Defined unit types are :sigmoid, :asigmoid, and :gaussian."
  (ecase *unit-type*
    (:sigmoid
     ;; Symmetric sigmoid function in range -0.5 to +0.5.
     ;; Saturate for extreme sums instead of calling EXP.
     (cond ((< sum -15.0) -0.5)
           ((> sum 15.0) +0.5)
           (t (-sf (/sf 1.0 (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:asigmoid
     ;; Asymmetric sigmoid in range 0.0 to 1.0.
     (cond ((< sum -15.0) 0.0)
           ((> sum 15.0) 1.0)
           (t (/sf 1.0 (+sf 1.0 (exp (-sf sum)))))))
    (:gaussian
     ;; Gaussian activation function in range 0.0 to 1.0.
     ;; Return 0.0 directly for very negative exponents, avoiding
     ;; floating-point underflow inside EXP.
     (let ((x (*sf -0.5 sum sum)))
       (if (< x -75.0) 0.0 (exp x))))))
;;; Note: do not use *SIGMOID-PRIME-OFFSET* here, as it confuses the
;;; correlation machinery. But do use it in output-prime, since it does no
;;; harm there and the output units often get stuck at extreme values.
(defun activation-prime (value sum)
  "Given the unit's activation value and sum of weighted inputs, compute
the derivative of the activation with respect to the sum. Defined unit
types are :sigmoid, :asigmoid, and :gaussian."
  (declare (short-float value sum))
  (ecase *unit-type*
    (:sigmoid
     ;; For symmetric sigmoid v in (-0.5, +0.5): dv/dsum = 0.25 - v^2.
     (-sf 0.25 (*sf value value)))
    (:asigmoid
     ;; For asymmetric sigmoid v in (0, 1): dv/dsum = v * (1 - v).
     (*sf value (-sf 1.0 value)))
    (:gaussian
     ;; For v = exp(-sum^2 / 2): dv/dsum = -sum * v.
     (*sf (-sf value) sum))))
(proclaim '(inline output-function output-prime))
(defun output-function (sum)
  "Compute the value of an output, given the weighted sum of incoming values.
Defined output types are :sigmoid and :linear."
  (declare (short-float sum))
  (ecase *output-type*
    ;; Symmetric sigmoid, range -0.5 to +0.5, saturating for extreme
    ;; sums instead of calling EXP.
    (:sigmoid (cond ((< sum -15.0) -0.5)
                    ((> sum 15.0) +0.5)
                    (t (-sf (/sf 1.0 (+sf 1.0 (exp (-sf sum)))) 0.5))))
    (:linear sum)))
(defun output-prime (output)
  "Compute the derivative of an output with respect to the weighted sum of
incoming values. Defined output types are :sigmoid and :linear."
  (declare (short-float output))
  (ecase *output-type*
    (:sigmoid
     ;; For the symmetric sigmoid v: dv/dsum = 0.25 - v^2. The offset
     ;; keeps the derivative away from zero when outputs saturate.
     (+sf *sigmoid-prime-offset* (-sf 0.25 (*sf output output))))
    (:linear 1.0)))
;;; The basic routine for doing Quickprop-style update of weights.
;;; Distilled essence of a year's work...
(proclaim '(inline quickprop-update))
(defun quickprop-update (i weights deltas slopes prevs
                         epsilon decay mu shrink-factor)
  "Given vectors holding weights, deltas, slopes, and previous slopes,
and an index i, update weight(i) and delta(i) appropriately. Move
slope(i) to prev(i) and zero out slope(i). Add weight decay term to
each slope before doing the update.
EPSILON scales the linear gradient-descent term, DECAY is the weight-decay
factor, MU caps the step at MU times the previous step, and SHRINK-FACTOR
is the caller-precomputed value MU/(1+MU)."
  (let* ((w (fvref weights i))
         (d (fvref deltas i))
         ;; Effective slope = raw slope plus the weight-decay term.
         (s (+sf (fvref slopes i) (*sf decay w)))
         (p (fvref prevs i))
         (next-step 0.0))
    (declare (short-float w p s d next-step))
    ;; The step must always be downhill.
    (cond
     ;; If last step was negative...
     ((minusp d)
      ;; First, add in linear term if current slope is still positive.
      (when (plusp s)
        (decf-sf next-step (*sf epsilon s)))
      (cond
       ;; If current slope is close to or larger than prev slope...
       ((>= s (*sf shrink-factor p))
        ;; Take maximum size negative step.
        (incf-sf next-step (*sf mu d)))
       ;; Else, use quadratic estimate.
       (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
     ;; If last step was positive...
     ((plusp d)
      ;; First, add in linear term if current slope is still negative.
      (when (minusp s)
        (decf-sf next-step (*sf epsilon s)))
      (cond
       ;; If current slope is close to or more neg than prev slope...
       ((<= s (*sf shrink-factor p))
        ;; Take maximum size positive step.
        (incf-sf next-step (*sf mu d)))
       ;; Else, use quadratic estimate.
       (t (incf-sf next-step (*sf d (/sf s (-sf p s)))))))
     ;; Last step was zero, so use only linear term.
     (t (decf-sf next-step (*sf epsilon s))))
    ;; Having computed the next step, update the data vectors.
    (setf (fvref deltas i) next-step)
    (setf (fvref weights i) (+sf w next-step))
    (setf (fvref prevs i) s)
    (setf (fvref slopes i) 0.0)
    nil))
;;;; Machinery for training output weights.
(defun set-up-inputs (input)
  "Copy the external input pattern INPUT into the leading slots of the
*VALUES* vector; slot zero gets the bias unit's constant 1.0."
  (setf (fvref *values* 0) 1.0)
  (do ((source 0 (1+ source)))
      ((= source *ninputs*))
    (declare (fixnum source))
    (setf (fvref *values* (1+ source))
          (the short-float (svref input source)))))
(defun output-forward-pass ()
  "Assume the *VALUES* vector has been set up. Just compute the network's
outputs."
  (dotimes1 (j *noutputs*)
    (let ((ow (svref *output-weights* j))
          (sum 0.0))
      (declare (short-float sum))
      ;; Each output sees every active unit: bias, inputs, and hidden units.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref *values* i) (fvref ow i))))
      (setf (fvref *outputs* j)
            (output-function sum)))))
(proclaim '(inline compute-unit-value))
(defun compute-unit-value (j prev-value)
  "Assume that *VALUES* vector has correct current values for all units
with index less than J. Compute, record, and return the value for unit
J. PREV-VALUE is the previous value of unit J."
  (declare (fixnum j) (short-float prev-value))
  (let* ((w (svref *weights* j))
         (sum 0.0))
    (declare (short-float sum))
    ;; Feed-forward contribution from all earlier units.
    (dotimes1 (i j)
      (incf-sf sum (*sf (fvref *values* i)
                        (fvref w i))))
    ;; Weight slot J is the recurrent self-connection, applied to the
    ;; unit's own previous value.
    (incf-sf sum (*sf prev-value (fvref w j)))
    (setf (fvref *values* j) (activation sum))))
(defun full-forward-pass (input no-memory)
  "This is called only when not using the cache. Set up the inputs from
the INPUT vector, then propagate activation values forward through all
hidden units and output units. If NO-MEMORY is T, assume the previous
unit values are all zero."
  (set-up-inputs input)
  ;; For each hidden unit J, compute the activation value. Assume that the
  ;; *values* vector is full of values from the previous training pattern,
  ;; which supply the recurrent self-connection inputs.
  (do ((j (1+ *ninputs*) (1+ j)))
      ((= j *nunits*))
    (declare (fixnum j))
    (compute-unit-value j (if no-memory 0.0 (fvref *values* j))))
  ;; Now compute outputs.
  (output-forward-pass))
(defmacro compute-errors (goal err-bits true-err sum-sq-err slopes-p)
  "GOAL is a vector of desired outputs.  Compute and record the error
  statistics, incrementing the ERR-BITS, TRUE-ERR, and SUM-SQ-ERR variables,
  and the proper entry in *SUM-ERRORS*.  If SLOPES-P is true, also compute
  and record the slopes for output weights.  Note that SLOPES-P is tested
  at macro-expansion time, so the slope-updating code is generated only
  where it is actually needed."
  `(dotimes1 (j *noutputs*)
     (let* ((out (fvref *outputs* j))
            (dif (-sf out (svref ,goal j)))
            ;; Error derivative through the output transfer function.
            (err-prime (*sf dif (output-prime out)))
            ,@(when slopes-p
                '((os (svref *output-slopes* j)))))
       (declare (short-float dif err-prime))
       ;; An output counts as "wrong" if it misses by *SCORE-THRESHOLD* or more.
       (unless (< (abs dif) *score-threshold*)
         (incf ,err-bits))
       (incf-sf ,true-err (*sf dif dif))
       (setf (fvref *errors* j) err-prime)
       (incf-sf (fvref *sum-errors* j) err-prime)
       (incf-sf ,sum-sq-err (*sf err-prime err-prime))
       ;; Accumulate d(error)/d(weight) for every output weight.
       ,@(when slopes-p
           '((dotimes1 (i *nunits*)
               (incf-sf (fvref os i) (*sf err-prime (fvref *values* i)))))))))
(defmacro recompute-errors (goal)
  "Stripped-down version of COMPUTE-ERRORS: refresh the *ERRORS* vector
  for the current case against the GOAL vector, without touching slopes
  or any of the error statistics."
  `(dotimes1 (o *noutputs*)
     (let* ((actual (fvref *outputs* o))
            (delta (-sf actual (svref ,goal o)))
            (eprime (*sf delta (output-prime actual))))
       (declare (short-float delta eprime))
       (setf (fvref *errors* o) eprime))))
;;; Note: Scaling *OUTPUT-EPSILON* by the number of cases seems to keep the
;;; quickprop update in a good range across many different-sized training
;;; sets, but it's something of a hack. Choosing good epsilon values
;;; still requires some trial and error.
(defun update-output-weights ()
  "Apply one quickprop step to every output weight, using the slopes,
  previous slopes, and delta values accumulated during the last epoch.
  The step size is *OUTPUT-EPSILON* scaled down by the number of cases."
  (let ((eps (/ *output-epsilon* *ncases*)))
    (dotimes1 (j *noutputs*)
      (let ((weights (svref *output-weights* j))
            (deltas (svref *output-deltas* j))
            (slopes (svref *output-slopes* j))
            (prevs (svref *output-prev-slopes* j)))
        (dotimes1 (i *nunits*)
          (quickprop-update i weights deltas slopes prevs eps
                            *output-decay* *output-mu*
                            *output-shrink-factor*))))))
;;;; Outer loops for training output weights.
(defun train-outputs-epoch ()
  "Perform forward propagation once for each set of weights in the
  training vectors, computing errors and slopes.  Then update the output
  weights.  Sets *ERROR-BITS*, *TRUE-ERROR*, and *SUM-SQ-ERROR* as a side
  effect."
  (let ((err-bits 0)
        (true-err 0.0)
        (sum-sq-err 0.0))
    (declare (fixnum err-bits) (short-float true-err sum-sq-err))
    ;; Clear the per-output error accumulators for this epoch.
    (dotimes1 (o *noutputs*)
      (setf (fvref *sum-errors* o) 0.0))
    ;; User may have changed mu between epochs, so fix shrink-factor.
    (setq *output-shrink-factor*
          (/sf *output-mu* (+sf 1.0 *output-mu*)))
    ;; Initialize values vector.
    (unless *use-cache*
      (setq *values* *extra-values*)
      (setq *errors* *extra-errors*)
      ;; Zero the hidden-unit values so the first case has no prior context.
      (do ((j (1+ *ninputs*) (1+ j)))
          ((= j *nunits*))
        (declare (fixnum j))
        (setf (fvref *values* j) 0.0)))
    ;; Now run through the training examples.
    (do ((i *first-case* (1+ i)))
        ((= i (the fixnum (+ *first-case* *ncases*))))
      (declare (fixnum i))
      (cond (*use-cache*
             ;; Cached activations: only the output layer needs recomputing.
             (setq *values* (svref *values-cache* i))
             (setq *errors* (svref *errors-cache* i))
             (output-forward-pass))
            ;; No cache: full pass; a break flag resets the recurrent context.
            (t (full-forward-pass (svref *training-inputs* i)
                                  (and *use-training-breaks*
                                       (svref *training-breaks* i)))))
      ;; Final T: also accumulate slopes for the weight update below.
      (compute-errors (svref *training-outputs* i)
                      err-bits true-err sum-sq-err t))
    ;; Publish the epoch statistics in the global variables.
    (setq *error-bits* err-bits)
    (setq *true-error* (+sf 0.0 true-err))
    (setq *sum-sq-error* (+sf 0.0 sum-sq-err))
    ;; Do not change weights or count epoch if this run was perfect.
    (unless (= 0 *error-bits*)
      (update-output-weights)
      (incf *epoch*))))
(defun record-output-weights ()
  "Snapshot the current output weights into *OUTPUT-WEIGHTS-RECORD*,
  indexed by the number of hidden units present when they were trained.
  Each output gets its own freshly-allocated copy of its weight vector."
  (let ((record (make-array *noutputs* :initial-element nil)))
    (dotimes1 (o *noutputs*)
      (let ((source (svref *output-weights* o))
            (snapshot (make-array *nunits* :element-type 'short-float
                                  :initial-element 0.0)))
        (dotimes1 (u *nunits*)
          (setf (fvref snapshot u) (fvref source u)))
        (setf (svref record o) snapshot)))
    (setf (svref *output-weights-record* (1- *nunits*)) record)))
(defun train-outputs (max-epochs)
  "Train the output weights.  If we exhaust MAX-EPOCHS, stop with value
  :TIMEOUT.  If there are zero error bits, stop with value :WIN.  Else,
  keep going until the true error has not changed by a significant amount
  for *OUTPUT-PATIENCE* epochs.  Then return :STAGNANT.  If
  *OUTPUT-PATIENCE* is zero, we do not stop until victory or until
  MAX-EPOCHS is used up.  The output weights are recorded before every
  return."
  (declare (fixnum max-epochs))
  (let ((last-error 0.0)
        (quit-epoch (+ *epoch* *output-patience*))
        (first-time t))
    (declare (fixnum quit-epoch)
             (short-float last-error))
    (dotimes1 (i max-epochs (progn
                              (record-output-weights)
                              :timeout))
      ;; Maybe run a test epoch to see how we're doing.
      (when (and *test*
                 (not (= 0 *test-interval*))
                 (= 0 (mod i *test-interval*)))
        (test-epoch))
      (train-outputs-epoch)
      (cond ((zerop *error-bits*)
             ;; Perfect performance on the training set.
             (record-output-weights)
             (return :win))
            ;; Patience of zero means never stop for stagnation.
            ((zerop *output-patience*))
            (first-time
             ;; First epoch: just establish the baseline error.
             (setq first-time nil)
             (setq last-error *true-error*))
            ((> (abs (- *true-error* last-error))
                (* last-error *output-change-threshold*))
             ;; Significant change: reset the stagnation countdown.
             (setq last-error *true-error*)
             (setq quit-epoch (+ *epoch* *output-patience*)))
            ((>= *epoch* quit-epoch)
             ;; No significant progress for *OUTPUT-PATIENCE* epochs.
             (record-output-weights)
             (return :stagnant))))))
;;;; Machinery for Training, Selecting, and Installing Candidate Units.
(defun init-candidates ()
  "Reset the entire candidate pool: fresh random input weights for every
  candidate, and zeros for all per-candidate bookkeeping (values, deltas,
  slopes, correlations, and cached derivatives)."
  (dotimes1 (i *ncandidates*)
    (setf (fvref *cand-values* i) 0.0
          (fvref *cand-sum-values* i) 0.0
          (fvref *cand-scores* i) 0.0)
    (let ((weights (svref *cand-weights* i))
          (deltas (svref *cand-deltas* i))
          (slopes (svref *cand-slopes* i))
          (prev-slopes (svref *cand-prev-slopes* i))
          (cors (svref *cand-cor* i))
          (prev-cors (svref *cand-prev-cor* i))
          (derivs (svref *cand-derivs* i)))
      ;; One slot per existing unit, plus one for the self-recurrent weight.
      (dotimes1 (j (1+ *nunits*))
        (setf (fvref weights j) (random-weight)
              (fvref deltas j) 0.0
              (fvref slopes j) 0.0
              (fvref prev-slopes j) 0.0
              (fvref derivs j) 0.0))
      (dotimes1 (o *noutputs*)
        (setf (fvref cors o) 0.0
              (fvref prev-cors o) 0.0)))))
(defun install-new-unit ()
  "Add the candidate-unit with the best correlation score to the active
  network.  Then reinitialize the candidate pool.  Increments *NUNITS*."
  (when (>= *nunits* *max-units*)
    (error "Cannot add any more units."))
  ;; Copy the weight vector for the new unit.
  ;; The extra slot at index *NUNITS* is the self-recurrent weight.
  (let ((w (make-array (+ *nunits* 1) :element-type 'short-float))
        (cw (svref *cand-weights* *best-candidate*)))
    (dotimes1 (i (+ *nunits* 1))
      (setf (fvref w i) (fvref cw i)))
    (setf (svref *weights* *nunits*) w)
    ;; Tell user about the new unit.
    (format t " Add unit ~S: ~S~%"
            (+ 1 *nunits*) w))
  ;; Fix up output weights for candidate unit.
  ;; Use minus the correlation times the *weight-multiplier* as an
  ;; initial guess. At least the sign should be right.
  (dotimes1 (o *noutputs*)
    (setf (fvref (svref *output-weights* o) *nunits*)
          (*sf (-sf (fvref (svref *cand-prev-cor* *best-candidate*) o))
               *weight-multiplier*)))
  ;; If using cache, run an epoch to compute this unit's values.
  (when *use-cache*
    (let ((prev-value 0.0))
      (dotimes1 (i *max-cases*)
        (setq *values* (svref *values-cache* i))
        ;; Feed the previous case's value back through the recurrent
        ;; weight, unless this case starts a new sequence (break flag).
        (setq prev-value
              (compute-unit-value
               *nunits*
               (if (and *use-training-breaks*
                        (svref *training-breaks* i))
                   0.0
                   prev-value))))))
  ;; Reinitialize candidate units with random weights.
  (incf *nunits*)
  (init-candidates))
;;; Note: Ideally, after each adjustment of the candidate weights, we would
;;; run two epochs. The first would just determine the correlations
;;; between the candidate unit outputs and the residual error. Then, in a
;;; second pass, we would adjust each candidate's input weights so as to
;;; maximize the absolute value of the correlation. We need to know the
;;; sign of the correlation for each candidate-output pair so that we know
;;; which direction to tune the input weights.
;;; Since this ideal method doubles the number of epochs required for
;;; training candidates, we cheat slightly and use the correlation values
;;; computed BEFORE the most recent weight update. This combines the two
;;; epochs, saving us almost a factor of two. To bootstrap the process, we
;;; begin with a single epoch that computes only the correlation.
;;; Since we look only at the sign of the correlation and since that sign
;;; should change very infrequently, this probably is OK. But keep a
;;; lookout for pathological situations in which this might cause
;;; oscillation.
;;; This function is used only once at the start of each output-training
;;; phase to prime the pump. After that, each call to compute-slopes also
;;; computes the error-value products for the next epoch.
(defun compute-correlations (no-memory)
  "For the current training pattern, compute the value of each candidate
  unit and begin to compute the correlation between that unit's value and
  the error at each output.  We have already done a forward-prop and
  computed the error values for active units.  If NO-MEMORY is non-nil,
  this case begins a new sequence and the candidate's previous value is
  not fed back."
  (dotimes1 (u *ncandidates*)
    (let ((sum 0.0)
          (v 0.0)
          (cw (svref *cand-weights* u))
          (cc (svref *cand-cor* u)))
      (declare (short-float sum v))
      ;; Determine activation value of each candidate unit.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      ;; Maybe add in the activation from the auto-recursive weight.
      (unless no-memory
        (incf-sf sum (*sf (fvref cw *nunits*)
                          (fvref *cand-values* u))))
      (setq v (activation sum))
      ;; Remember this value for the next time step, and accumulate the
      ;; running total used later to form the candidate's mean value.
      (setf (fvref *cand-values* u) v)
      (incf-sf (fvref *cand-sum-values* u) v)
      ;; Accumulate value of each unit times error at each output.
      (dotimes1 (o *noutputs*)
        (incf-sf (fvref cc o) (*sf v (fvref *errors* o)))))))
;;; Note: When we were computing true correlations between candidates and
;;; outputs, this is where the normalization factors went in. Currently we
;;; are just using covariances, as explained in the tech report. So we
;;; make only two adjustments here. First, we subtract out the product of
;;; the mean error and the mean candidate value to keep things from
;;; exploding when the error has a non-zero mean. Second, we effectively
;;; scale the error values by the sum-squared error over all training
;;; cases. This just keeps us from having to adjust *input-epsilon*
;;; repeatedly as the error is gradually reduced to a small fraction of its
;;; initial size.
(defun adjust-correlations ()
  "Normalize each accumulated correlation value, and stuff the normalized
  form into the *cand-prev-cor* data structure.  Then zero *cand-cor* to
  prepare for the next round.  Note the unit with the best total
  correlation score, leaving it in *BEST-CANDIDATE* and its score in
  *BEST-CANDIDATE-SCORE*."
  (setq *best-candidate* 0)
  (setq *best-candidate-score* 0.0)
  (dotimes1 (u *ncandidates*)
    (let* ((cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u))
           ;; Mean candidate value over the active training cases.
           (avg-value (/ (fvref *cand-sum-values* u) *ncases*))
           (cor 0.0)
           (score 0.0))
      (declare (short-float avg-value cor score))
      (dotimes1 (o *noutputs*)
        ;; Covariance form: subtract mean-value * summed-error, then
        ;; scale by the total sum-squared error over all cases.
        (setq cor (/sf (-sf (fvref cc o)
                            (*sf avg-value (fvref *sum-errors* o)))
                       *sum-sq-error*))
        (setf (fvref cpc o) cor)
        (setf (fvref cc o) 0.0)
        ;; A candidate's score is the sum of |correlation| over outputs.
        (incf-sf score (abs cor)))
      ;; Keep track of the candidate with the best overall correlation.
      (setf (fvref *cand-scores* u) score)
      (when (> score *best-candidate-score*)
        (setq *best-candidate-score* (+sf 0.0 score))
        (setq *best-candidate* u)))))
;;; This is the key function in the candidate training process.
(defun compute-slopes (no-memory)
  "Given the correlation values for each candidate-output pair, compute
  the derivative of the candidate's score with respect to each incoming
  weight.  Also accumulates the error-value products that become the next
  epoch's correlations (see the note above about combining the two
  epochs).  NO-MEMORY non-nil means this case starts a new sequence, so
  all recurrent history is ignored."
  (dotimes1 (u *ncandidates*)
    (let* ((sum 0.0)
           (value 0.0)
           (actprime 0.0)
           (direction 0.0)
           (dsum 0.0)
           (cw (svref *cand-weights* u))
           (cs (svref *cand-slopes* u))
           (cc (svref *cand-cor* u))
           (cpc (svref *cand-prev-cor* u))
           (cdv (svref *cand-derivs* u))
           ;; WS is the candidate's self-recurrent weight.
           (ws (fvref cw *nunits*)))
      (declare (short-float sum value actprime direction dsum ws))
      ;; Forward pass through each candidate unit to compute activation-prime.
      (dotimes1 (i *nunits*)
        (incf-sf sum (*sf (fvref cw i)
                          (fvref *values* i))))
      ;; Maybe add in the activation from the auto-recursive weight.
      (unless no-memory
        (incf-sf sum (*sf ws (fvref *cand-values* u))))
      (setq value (activation sum))
      (setq actprime (activation-prime value sum))
      ;; Now compute which way we want to adjust each unit's incoming
      ;; activation sum, and how much.  The sign comes from LAST epoch's
      ;; correlation (CPC); the magnitude from this case's deviation of
      ;; the error from its mean, scaled by the overall sum-squared error.
      (dotimes1 (o *noutputs*)
        (let ((error (fvref *errors* o)))
          (decf-sf direction
                   (*sf (if (minusp (fvref cpc o)) -1.0 1.0)
                        (/sf (-sf error (fvref *sum-errors* o))
                             *sum-sq-error*)))
          ;; Also accumulate the error-value products for use next epoch.
          (incf-sf (fvref cc o) (*sf error value))))
      ;; Given the direction we want to push the unit's sum, compute
      ;; which way we want to tweak each incoming weight.  CDV carries
      ;; each weight's value-derivative over from the previous time step
      ;; (the recurrent chain-rule term).
      (dotimes1 (i *nunits*)
        (setq dsum (*sf actprime
                        (+sf (fvref *values* i)
                             (if no-memory 0.0
                                 (*sf ws (fvref cdv i))))))
        (incf-sf (fvref cs i)
                 (*sf direction dsum))
        (setf (fvref cdv i) dsum))
      (unless no-memory
        ;; Compute derivative of activation sum with respect to the unit's
        ;; auto-recurrent weight.
        (setq dsum (*sf actprime
                        (+sf (fvref *cand-values* u)
                             (*sf ws (fvref cdv *nunits*)))))
        (incf-sf (fvref cs *nunits*)
                 (*sf direction dsum))
        (setf (fvref cdv *nunits*) dsum))
      ;; Save unit value for use in next training case.
      (setf (fvref *cand-values* u) value)
      (incf-sf (fvref *cand-sum-values* u) value))))
;;; Note: Scaling *INPUT-EPSILON* by the number of cases and number of
;;; inputs to each unit seems to keep the quickprop update in a good range,
;;; as the network goes from small to large, and across many
;;; different-sized training sets. Still, choosing a good epsilon value
;;; requires some trial and error.
(defun update-input-weights ()
  "Apply one quickprop step to every candidate input weight (including
  the self-recurrent one), using the slopes, previous slopes, and delta
  values accumulated over the epoch.  The step size is *INPUT-EPSILON*
  scaled down by cases times incoming connections."
  (let ((eps (/ *input-epsilon* (* *ncases* *nunits*))))
    (dotimes1 (u *ncandidates*)
      (let ((weights (svref *cand-weights* u))
            (deltas (svref *cand-deltas* u))
            (slopes (svref *cand-slopes* u))
            (prevs (svref *cand-prev-slopes* u)))
        (dotimes1 (i (1+ *nunits*))
          (quickprop-update i weights deltas slopes prevs eps
                            *input-decay* *input-mu*
                            *input-shrink-factor*))))))
;;; Outer loop for training the candidate unit(s).
(defun train-inputs-epoch ()
  "For each training pattern, perform a forward pass. Tune the candidate units'
  weights to maximize the correlation score of each."
  ;; Initialize some things.
  (dotimes1 (u *ncandidates*)
    (setf (fvref *cand-values* u) 0.0)
    (setf (fvref *cand-sum-values* u) 0.0))
  ;; Without the cache, use the scratch vectors and clear hidden values.
  (unless *use-cache*
    (setq *values* *extra-values*)
    (setq *errors* *extra-errors*)
    (do ((j (1+ *ninputs*) (1+ j)))
        ((= j *nunits*))
      (declare (fixnum j))
      (setf (fvref *values* j) 0.0)))
  ;; Now run through all the training examples.
  (do ((i *first-case* (1+ i)))
      ((= i (+ *first-case* *ncases*)))
    (declare (fixnum i))
    (let ((no-memory (and *use-training-breaks*
                          (svref *training-breaks* i))))
      ;; Compute values and errors, or recall cached values.
      (cond (*use-cache*
             (setq *values* (svref *values-cache* i))
             (setq *errors* (svref *errors-cache* i)))
            (t (full-forward-pass (svref *training-inputs* i)
                                  no-memory)
               (recompute-errors (svref *training-outputs* i))))
      ;; Compute the slopes we will use to adjust candidate weights.
      (compute-slopes no-memory)))
  ;; User may have changed mu between epochs, so fix shrink-factor.
  (setq *input-shrink-factor* (/sf *input-mu*
                                   (+sf 1.0 *input-mu*)))
  ;; Now adjust the candidate unit input weights using quickprop.
  (update-input-weights)
  ;; Fix up the correlation values for the next epoch.
  (adjust-correlations)
  (incf *epoch*))
(defun correlations-epoch ()
  "Do an epoch through all active training patterns just to compute the
  initial correlations.  After this one pass, we will update the
  correlations as we train.  No weights are changed here."
  ;; Initialize some things.
  (dotimes1 (u *ncandidates*)
    (setf (fvref *cand-values* u) 0.0)
    (setf (fvref *cand-sum-values* u) 0.0))
  ;; Without the cache, use the scratch vectors and clear hidden values.
  (unless *use-cache*
    (setq *values* *extra-values*)
    (setq *errors* *extra-errors*)
    (do ((j (1+ *ninputs*) (1+ j)))
        ((= j *nunits*))
      (declare (fixnum j))
      (setf (fvref *values* j) 0.0)))
  ;; Now run through all the training examples.
  (do ((i *first-case* (1+ i)))
      ((= i (+ *first-case* *ncases*)))
    (declare (fixnum i))
    (let ((no-memory (and *use-training-breaks*
                          (svref *training-breaks* i))))
      ;; Compute values and errors, or recall cached values.
      (cond (*use-cache*
             (setq *values* (svref *values-cache* i))
             (setq *errors* (svref *errors-cache* i)))
            (t
             (full-forward-pass (svref *training-inputs* i)
                                no-memory)
             (recompute-errors (svref *training-outputs* i))))
      (compute-correlations no-memory)))
  ;; Normalize and roll the accumulated products into *CAND-PREV-COR*.
  (adjust-correlations)
  (incf *epoch*))
(defun train-inputs (max-epochs)
  "Train the input weights of all candidates.  If we exhaust MAX-EPOCHS,
  stop with value :TIMEOUT.  Else, keep going until the best candidate
  unit's score has changed by a significant amount, and then until it does
  not change significantly for *INPUT-PATIENCE* epochs.  Then return
  :STAGNANT.  If *INPUT-PATIENCE* is zero, we do not stop until victory
  or until MAX-EPOCHS is used up."
  (declare (fixnum max-epochs))
  ;; Turn sum-errors into average errors.
  (dotimes1 (o *noutputs*)
    (setf (fvref *sum-errors* o)
          (/ (fvref *sum-errors* o) *ncases*)))
  ;; Bootstrap epoch: compute the initial correlations.
  (correlations-epoch)
  (let ((last-score 0.0)
        (quit max-epochs)
        (first-time t))
    (declare (fixnum quit)
             (short-float last-score))
    (dotimes1 (i max-epochs :timeout)
      (train-inputs-epoch)
      (cond ;; Patience of zero means never stop for stagnation.
            ((zerop *input-patience*))
            (first-time
             ;; First epoch: just record the baseline score.
             (setq first-time nil)
             (setq last-score *best-candidate-score*))
            ((> (abs (-sf *best-candidate-score* last-score))
                (* last-score *input-change-threshold*))
             ;; Significant change: reset the stagnation countdown.
             ;; Note QUIT counts local epochs (I), not the global *EPOCH*.
             (setq last-score *best-candidate-score*)
             (setq quit (+ i *input-patience*)))
            ((>= i quit)
             (return :stagnant))))))
;;;; Outer Loop.
(defun list-parameters ()
  "Print out the current training parameters in abbreviated form."
  ;; Sigmoid-prime offset, random weight range, new-unit weight multiplier.
  (format t "SigOff ~,2F, WtRng ~,2F, WtMul ~,2F~%"
          *sigmoid-prime-offset* *weight-range* *weight-multiplier*)
  ;; Quickprop parameters for the output-weight training phase.
  (format t "OMu ~,2F, OEps ~,2F, ODcy ~,4F, OPat ~D, OChange ~,3F~%"
          *output-mu* *output-epsilon* *output-decay* *output-patience*
          *output-change-threshold*)
  ;; Quickprop parameters for the candidate (input-weight) training phase.
  (format t "IMu ~,2F, IEps ~,2F, IDcy ~,4F, IPat ~D, IChange ~,3F~%"
          *input-mu* *input-epsilon* *input-decay* *input-patience*
          *input-change-threshold*)
  ;; Unit and output transfer-function types, and candidate pool size.
  (format t "Utype ~S, Otype ~S, Pool ~D~%"
          *unit-type* *output-type* *ncandidates*))
(defun train (outlimit inlimit rounds &optional (restart nil))
  "Train the output weights until stagnation or victory is reached.  Then
  train the input weights to stagnation or victory.  Then install the best
  candidate unit and repeat.  OUTLIMIT and INLIMIT are upper limits on the number
  of cycles in each output and input phase.  ROUNDS is an upper limit on
  the number of unit-installation cycles.  If RESTART is non-nil, we are
  restarting training from the current point -- do not reinitialize the net.
  Returns :LOSE if ROUNDS is exhausted without victory."
  (declare (fixnum outlimit inlimit rounds))
  (unless restart (init-net))
  (list-parameters)
  ;; With the cache enabled, pre-load the input portion of every cached
  ;; values vector once, up front.
  (when *use-cache*
    (dotimes1 (i *max-cases*)
      (setq *values* (svref *values-cache* i))
      (set-up-inputs (svref *training-inputs* i))))
  (setq *done* nil)
  (dotimes1 (r rounds :lose)
    ;; Output-training phase.
    (case (train-outputs outlimit)
      (:win
       (list-parameters)
       (format t "Victory at ~S epochs, ~S units, ~S hidden, Error ~S.~%"
               *epoch* *nunits* (- *nunits* *ninputs* 1) *true-error*)
       (return nil))
      (:timeout
       (format t "Epoch ~D: Out Timeout ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*))
      (:stagnant
       (format t "Epoch ~D: Out Stagnant ~D bits wrong, error ~S.~2%"
               *epoch* *error-bits* *true-error*)))
    (when *test* (test-epoch))
    ;; *DONE* is cleared above and checked each round; NOTE(review):
    ;; nothing in this chunk sets it -- confirm where it gets set.
    (when *done* (return nil))
    ;; Candidate-training phase.
    (case (train-inputs inlimit)
      (:timeout
       (format t "Epoch ~D: In Timeout. Cor: ~D~%"
               *epoch* *best-candidate-score*))
      (:stagnant
       (format t "Epoch ~D: In Stagnant. Cor: ~D~%"
               *epoch* *best-candidate-score*)))
    (install-new-unit)))
(defun test-epoch (&optional (*score-threshold* 0.49999))
  "Perform forward propagation once for each set of weights in the training
  and testing vectors.  Reporting the performance.  Do not change any
  weights.  Do not use the caches.  The optional argument rebinds the
  special variable *SCORE-THRESHOLD* for the duration of the test."
  ;; Rebind the specials so the real training state is left untouched.
  (let ((*use-cache* nil)
        (*values* *extra-values*)
        (*errors* *extra-errors*)
        (*sum-errors* *dummy-sum-errors*)
        (train-err-bits 0)
        (test-err-bits 0)
        (train-true-err 0.0)
        (test-true-err 0.0)
        (sum-sq-err 0.0))
    (declare (fixnum train-err-bits test-err-bits)
             (short-float train-true-err test-true-err sum-sq-err))
    ;; Zero context at the start of the training set.
    (do ((j (1+ *ninputs*) (1+ j)))
        ((= j *nunits*))
      (declare (fixnum j))
      (setf (fvref *values* j) 0.0))
    ;; Run all training patterns and count errors.
    (dotimes1 (i (length *training-inputs*))
      (full-forward-pass (svref *training-inputs* i)
                         (and *use-training-breaks*
                              (svref *training-breaks* i)))
      ;; Final NIL: do not accumulate output-weight slopes while testing.
      (compute-errors (svref *training-outputs* i)
                      train-err-bits train-true-err sum-sq-err nil))
    (format t "Training: ~D of ~D wrong, error ~S."
            train-err-bits (length *training-inputs*) train-true-err)
    ;; Zero context at the start of the test set.
    (do ((j (1+ *ninputs*) (1+ j)))
        ((= j *nunits*))
      (declare (fixnum j))
      (setf (fvref *values* j) 0.0))
    ;; Now run all test patterns and report the results.
    (when *test-inputs*
      (dotimes1 (i (length *test-inputs*))
        (full-forward-pass (svref *test-inputs* i)
                           (and *use-test-breaks*
                                (svref *test-breaks* i)))
        (compute-errors (svref *test-outputs* i)
                        test-err-bits test-true-err sum-sq-err nil)))
    ;; Printed even when *TEST-INPUTS* is NIL (then reads "0 of 0 wrong").
    (format t " Test: ~D of ~D wrong, error ~S.~%"
            test-err-bits (length *test-inputs*) test-true-err)))
;;; ***************************************************************************
;;; Example of usage: code to create morse code test described in the paper.
;;; ***************************************************************************
;; The example code below is not performance-critical, so trade speed
;; for safety checking.
(proclaim '(optimize (speed 1) (safety 3)))
(defvar *nletters* 26
  "Number of letters we will ultimately train the current network on.
  This controls number of outputs, etc.")
(proclaim '(fixnum *nletters*))
(defvar *code-conversions* (make-array 27)
  "Given index of the desired letter (A is 1, etc.), get the code string
  in dots and dashes.  Reserve output zero for the strobe.")
(proclaim '(simple-vector *code-conversions*))
;;; Set up the Morse code string for each letter (A = 1 ... Z = 26).
;; Fill slots 1 (A) through 26 (Z) of the conversion table from a
;; data list, in alphabetical order.
(let ((codes '(".-"   "-..." "-.-." "-.."  "."    "..-." "--."
               "...." ".."   ".---" "-.-"  ".-.." "--"   "-."
               "---"  ".--." "--.-" ".-."  "..."  "-"    "..-"
               "...-" ".--"  "-..-" "-.--" "--..")))
  (loop for code in codes
        for index from 1
        do (setf (aref *code-conversions* index) code)))
(defun string-to-code (string)
  "Given a string of characters, returns three values: a vector of input
  patterns in the proper morse-code representation, a corresponding vector
  of output patterns, and a vector of break values with a T at the start of
  each new character.  Each input pattern is a one-element vector: +0.5
  while the key is down, -0.5 while it is up.  A dot is one on-step plus
  one off-step; a dash is two on-steps plus one off-step; each letter is
  followed by one extra off-step whose desired output is the strobe plus
  the letter's own output unit."
  (setq string (string-upcase string))
  (let* ((inlist nil)
         (outlist nil)
         (breaklist nil))
    (dotimes (i (length string))
      (let* (;; Letter index: A = 1, ..., Z = 26.
             (char (- (char-code (char string i)) 64))
             (morse (aref *code-conversions* char))
             ;; Desired output while the letter is still being keyed.
             (outpat (make-array (+ *nletters* 1)
                                 :initial-element -0.5))
             ;; Desired output at letter end: strobe plus the letter's unit.
             (strobepat (make-array (+ *nletters* 1)
                                    :initial-element -0.5)))
        (setf (aref strobepat 0) +0.5)
        (setf (aref strobepat char) +0.5)
        (dotimes (j (length morse))
          ;; Push out the dot-dash patterns.  Note the input vectors are
          ;; shared quoted literals; they are only ever read, not mutated.
          (if (eql (aref morse j) #\.)
              (progn
                ;; Dot: key down one step, then up one step.
                (push '#(+0.5) inlist)
                (push '#(-0.5) inlist)
                (push outpat outlist)
                (push outpat outlist)
                ;; Break flag is T only at the first element of a letter.
                (push (= j 0) breaklist)
                (push nil breaklist))
              (progn
                ;; Dash: key down two steps, then up one step.
                (push '#(+0.5) inlist)
                (push '#(+0.5) inlist)
                (push '#(-0.5) inlist)
                (push outpat outlist)
                (push outpat outlist)
                (push outpat outlist)
                (push (= j 0) breaklist)
                (push nil breaklist)
                (push nil breaklist))))
        ;; Push the separating space and the strobe output.
        (push strobepat outlist)
        (push '#(-0.5) inlist)
        (push nil breaklist)))
    ;; Return the three values.
    (values (coerce (nreverse inlist) 'simple-vector)
            (coerce (nreverse outlist) 'simple-vector)
            (coerce (nreverse breaklist) 'simple-vector))))
;; Bookkeeping for the morse-code example: remember what data the net
;; was built from.
(defvar *training-string* nil
  "Save the string used for training cases here.")
(defvar *test-string* nil
  "Save the string used for testing cases here.")
(defun build-morse (string &optional (continue nil))
  "Given a string of characters, create a training set for the morse code
  representation of the string.  The test set is simply the training set
  itself.  If CONTINUE is non-nil, make use of pre-existing hidden units
  instead of rebuilding and reinitializing the net."
  (setq *training-string* string)
  (multiple-value-bind (in out breaks)
      (string-to-code string)
    (setq *training-inputs* in)
    (setq *training-outputs* out)
    (setq *training-breaks* breaks)
    (setq *use-training-breaks* t))
  ;; Test on the same data we train on.
  (setq *test-inputs* *training-inputs*)
  (setq *test-outputs* *training-outputs*)
  (setq *test-breaks* *training-breaks*)
  (setq *test-string* nil)
  (setq *use-test-breaks* t)
  ;; One input (the key), one output per letter plus the strobe.
  (setq *ninputs* 1)
  (setq *noutputs* (+ *nletters* 1))
  (if continue
      (changed-training-set)
      (progn
        (build-net *ninputs* *noutputs*)
        (init-net)))
  (format t "~%Training on ~S:~%" string))
;;; To run this, do something like the following:
;;; (build-morse "abcdefghijklmnopqrstuvwxyz")
;;; (train 150 150 25)
;;;
;;; Suggested parameters:
;;; SigOff 0.10, WtRng 1.00, WtMul 1.00
;;; OMu 2.00, OEps 0.01, ODcy 0.0001, OPat 15, OChange 0.010
;;; IMu 2.00, IEps 0.10, IDcy 0.0001, IPat 15, IChange 0.030
;;; Utype :SIGMOID, Otype :SIGMOID, Pool 32
;;;
;;; To train incrementally, do something like this:
;;; (build-morse "abcde")
;;; (train 150 150 25)
;;; (build-morse "fghij" t) ; The T here prevents re-initialization.
;;; (train 150 150 25 t) ; Ditto.
;;;
;;; THE END