;;; learning/agents/passive-adp-learner.lisp
;;; Reinforcement learning agent that uses dynamic
;;; programming to solve the Markov process
;;; that it learns from its experience. Thus, the
;;; main job is to update the model over time.
;;; Being a passive agent, it simply does no-op
;;; at each step, watching the world go by.
;;; Construct a passive ADP learning agent: a closure over its percept
;;; history and learned model that always emits the action 'no-op.
;;; Internal hash tables, each keyed on state (test #'equal):
;;;   visit-counts - times each state has been seen
;;;   utilities    - current utility estimates
;;;   model        - learned transition model
;;;   rewards      - reward observed on first visit to each state
(defun make-passive-adp-learner ()
  (let ((percept-history nil)
        (utilities (make-hash-table :test #'equal))
        (visit-counts (make-hash-table :test #'equal))
        (model (make-hash-table :test #'equal))
        (rewards (make-hash-table :test #'equal)))
    #'(lambda (percept)
        (push percept percept-history)
        (let ((state (mdp-percept-state percept)))
          ;; First visit to this state: create entries in every table.
          (when (null (gethash state visit-counts))
            (setf (gethash state visit-counts) 0)
            (setf (gethash state utilities) 0)
            (setf (gethash state model)
                  (list (cons 'no-op (make-mdp-action-model))))
            (setf (gethash state rewards) (mdp-percept-reward percept)))
          (incf (gethash state visit-counts))
          ;; Fold the newest observed transition into the model, then
          ;; re-solve for utilities under the fixed no-op policy.
          (update-passive-model state percept-history model)
          (setq utilities
                (value-determination (passive-policy model)
                                     utilities model rewards))
          ;; A terminal percept ends the current trial; drop the history.
          (when (mdp-percept-terminalp percept)
            (setq percept-history nil)))
        'no-op)))
;;; Updating the transition model according to observed transition i->j.
;;; Fairly tedious because of initializing new transition records.
;;; Record an observed transition into state j.  The second-newest
;;; percept identifies the source state i; the i->j transition count is
;;; incremented (creating the record if needed) and all of i's outgoing
;;; transition probabilities are renormalized from accumulated counts.
(defun update-passive-model (j percepts M &aux transition)
  ;; j        - current state (destination of the transition)
  ;; percepts - percept history, most recent first
  ;; M        - transition model, indexed by state
  (when (length>1 percepts)
    (let* ((previous-percept (second percepts))
           (i (mdp-percept-state previous-percept))
           ;; Update the model for state i (the transition's source).
           (model-for-i (action-model 'no-op i M)))
      (incf (mdp-action-model-times-executed model-for-i))
      ;; Locate the existing i->j record, or create and install one.
      (setq transition
            (find j (mdp-action-model-transitions model-for-i)
                  :test #'equal :key #'transition-destination))
      (when (null transition)
        (setq transition (make-transition :destination j))
        (push transition (mdp-action-model-transitions model-for-i)))
      (incf (transition-times-achieved transition))
      ;; Renormalize probabilities over all of i's recorded transitions.
      (dolist (tr (mdp-action-model-transitions model-for-i))
        (setf (transition-probability tr)
              (float (/ (transition-times-achieved tr)
                        (mdp-action-model-times-executed model-for-i))))))))
;;; (passive-policy M) makes a policy of no-ops for use in value determination
;;; Build a policy table from model M for use in value determination:
;;; every state maps to a fresh distribution choosing no-op with
;;; probability 1.0.
(defun passive-policy (M)
  (copy-hash-table
   M
   #'(lambda (old-value)
       (declare (ignore old-value))
       ;; Fresh list per state so entries share no structure.
       (list (list 'no-op 1.0)))))