;;; learning/agents/passive-adp-learner.lisp
;;; Reinforcement learning agent that uses dynamic
;;; programming to solve the Markov process
;;; that it learns from its experience. Thus, the
;;; main job is to update the model over time.
;;; Being a passive agent, it simply does no-op
;;; at each step, watching the world go by.

(defun make-passive-adp-learner ()
  (let ((percepts nil)                       ;;; percept history, most recent first
        (U (make-hash-table :test #'equal))  ;;; utility estimates, indexed by state
        (N (make-hash-table :test #'equal))  ;;; visit counts, indexed by state
        (M (make-hash-table :test #'equal))  ;;; transition model, indexed by state
        (R (make-hash-table :test #'equal))) ;;; observed rewards, indexed by state
    #'(lambda (e)
        (push e percepts)
        (let ((s (mdp-percept-state e)))
          (unless (gethash s N)              ;;; make entries for new state
            (setf (gethash s N) 0
                  (gethash s U) 0
                  (gethash s M) (list (cons 'no-op (make-mdp-action-model)))
                  (gethash s R) (mdp-percept-reward e)))
          (incf (gethash s N))
          (update-passive-model s percepts M)
          (setq U (value-determination (passive-policy M) U M R))
          (when (mdp-percept-terminalp e)    ;;; terminal percept ends the trial
            (setq percepts nil)))
        'no-op)))

;;; Update the transition model according to the observed transition i->j.
;;; Fairly tedious because of initializing new transition records.

(defun update-passive-model (j        ;;; current state (destination of transition)
                             percepts ;;; in reverse chronological order
                             M        ;;; transition model, indexed by state
                             &aux transition)
  (when (length>1 percepts)
    (let* ((e2 (second percepts))
           (i (mdp-percept-state e2)) ;;; transition from i, so update i's model
           (action-model (action-model 'no-op i M))
           (transitions (mdp-action-model-transitions action-model)))
      (incf (mdp-action-model-times-executed action-model))
      (unless (setq transition (find j transitions
                                     :test #'equal
                                     :key #'transition-destination))
        (push (setq transition (make-transition :destination j))
              (mdp-action-model-transitions action-model)))
      (incf (transition-times-achieved transition))
      (dolist (trans (mdp-action-model-transitions action-model))
        (setf (transition-probability trans)
              (float (/ (transition-times-achieved trans)
                        (mdp-action-model-times-executed action-model))))))))

;;; (passive-policy M) makes a policy of no-ops for use in value determination.

(defun passive-policy (M)
  (copy-hash-table M #'(lambda (x)
                         (declare (ignore x))
                         (list (list 'no-op 1.0)))))
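
;;; VALUE-DETERMINATION itself is defined elsewhere in the codebase; the
;;; sketch below shows, under stated assumptions, the fixed-point computation
;;; it performs for the fixed no-op policy: repeatedly apply
;;; U(i) <- R(i) + sum_j P(j|i) U(j) over all states known to the model,
;;; assuming the undiscounted formulation this file's model implies. The
;;; function name, the fixed iteration count, and the in-place
;;; (Gauss-Seidel-style) sweep are illustrative assumptions, not the
;;; actual implementation.

(defun value-determination-sketch (U M R &optional (iterations 20))
  (loop repeat iterations
        do (maphash
            #'(lambda (i action-models)
                ;; the model maps each state to an alist of (action . model);
                ;; a passive learner only ever has a 'no-op entry
                (let ((model (cdr (assoc 'no-op action-models))))
                  (setf (gethash i U)
                        (+ (gethash i R)
                           ;; expected utility of successors; terminal states
                           ;; have no recorded transitions, so the sum is 0
                           (loop for trans in (mdp-action-model-transitions model)
                                 sum (* (transition-probability trans)
                                        (gethash (transition-destination trans) U 0)))))))
            M)
        finally (return U)))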
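
;;; A minimal driving-loop sketch: the learner returned by
;;; MAKE-PASSIVE-ADP-LEARNER is a closure, so an environment simulator just
;;; funcalls it on each percept; it always returns 'no-op and updates its
;;; model and utilities as side effects. RUN-PASSIVE-ADP-SKETCH and its
;;; PERCEPT-SEQUENCE argument (a list of mdp-percept structures, oldest
;;; first) are hypothetical illustrations, not part of the surrounding
;;; codebase.

(defun run-passive-adp-sketch (percept-sequence)
  (let ((learner (make-passive-adp-learner)))
    (dolist (e percept-sequence learner)  ;;; return the learner closure itself
      (funcall learner e))))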