;;; learning/agents/exploring-adp-learner.lisp
;;; Reinforcement learning agent that uses dynamic
;;; programming to solve the Markov decision process
;;; that it learns from its experience. Thus, the
;;; main job is to update the model over time.
;;; Unlike the active-adp-learner, this agent uses
;;; an "intelligent" exploration policy to make sure it
;;; explores the state space reasonably quickly.
(defvar *R+* 2)  ; optimistic estimate of the best possible reward
(defvar *Ne* 5)  ; try each state-action pair at least this many times

;;; The exploration function f(u,n): be optimistic (return *R+*) until
;;; the action has been tried *Ne* times, then trust the utility estimate u.
(defun exploration-function (u n)
  (if (< n *Ne*) *R+* u))
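;;; For example, with the defaults above (a hypothetical REPL check):
;;;   (exploration-function 0.7 3)  =>  2    ; n < *Ne*: explore optimistically
;;;   (exploration-function 0.7 9)  =>  0.7  ; n >= *Ne*: use the estimate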
(defun make-exploring-adp-learner (actions)
  (let* ((percepts nil)
         (last-action nil)
         (U (make-hash-table :test #'equal))     ; utility estimates
         (N (make-hash-table :test #'equal))     ; state visit counts
         (M (make-hash-table :test #'equal))     ; transition model
         (R (make-hash-table :test #'equal))     ; reward estimates
         (mdp (make-mdp :model M :rewards R)))
    #'(lambda (e)
        (push e percepts)
        (let ((s (mdp-percept-state e)))
          (unless (gethash s N)          ;;; make entries for new state
            (setf (gethash s N) 0
                  (gethash s U) 0
                  (gethash s M) (mapcar
                                 #'(lambda (a)
                                     (cons a (make-mdp-action-model)))
                                 actions)
                  (gethash s R) (mdp-percept-reward e)))
          (incf (gethash s N))
          (update-active-model mdp percepts last-action)
          (when (mdp-terminal-states mdp)
            (setq U (exploratory-value-iteration mdp U)))
          (when (mdp-percept-terminalp e)  ;;; end of trial; start afresh
            (setq percepts nil))
          (setq last-action (exploration-choice s U M R))))))
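;;; A minimal usage sketch (hypothetical; assumes the surrounding AIMA
;;; learning code is loaded and that `percept' is an mdp-percept supplied
;;; by the environment on each step):
;;;   (let ((agent-program (make-exploring-adp-learner '(up down left right))))
;;;     (funcall agent-program percept))  ; returns the chosen action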
;;; Given an environment model M, determine the values of states U.
;;; Use value iteration, with initial values given by U itself.
;;; Basic equation is U(i) <- r(i) + max_a f(sum_j M(a,i,j)U(j), N(a,i))
;;; where f is the exploration function. Does not apply to terminal states.
(defun exploratory-value-iteration
    (mdp &optional (Uold (copy-hash-table (mdp-rewards mdp) #'identity))
         &key (epsilon 0.000001)
         &aux (Unew (copy-hash-table Uold #'identity))
              (max-delta infinity)
              (M (mdp-model mdp))
              (R (mdp-rewards mdp)))
  (do ()
      ((< max-delta epsilon) Unew)
    (setq max-delta 0)
    (rotatef Uold Unew)  ;;; switch contents; then we will overwrite Unew
    (maphash
     #'(lambda (s u)
         (unless (sink? s M)
           (setf (gethash s Unew)
                 (+ (gethash s R)
                    (if (gethash s M)
                        (apply #'max
                               (mapcar
                                #'(lambda (a)
                                    (if (member s (mdp-terminal-states mdp)
                                                :test #'equal)
                                        (gethash s R)
                                        (exploration-function
                                         (q-value a s Uold M R)
                                         (mdp-action-model-times-executed
                                          (action-model a s M)))))
                                (actions s M)))
                        0))))
         (setq max-delta (max max-delta (abs (- (gethash s Unew) u)))))
     Uold)))
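;;; In the agent above this is called as (exploratory-value-iteration mdp U),
;;; warm-starting from the previous utility table; a different convergence
;;; tolerance can be requested via the keyword, e.g.
;;;   (exploratory-value-iteration mdp U :epsilon 0.0001)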
;;; Choose the action for state s that maximizes the exploration value
;;; f(Q(a,s), N(a,s)), breaking ties at random.
(defun exploration-choice (s U M R)
  (the-biggest-random-tie
   #'(lambda (a)
       (exploration-function
        (q-value a s U M R)
        (mdp-action-model-times-executed (action-model a s M))))
   (actions s M)))
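;;; For instance, an action tried fewer than *Ne* times in s scores *R+*,
;;; so it is preferred unless some well-tried action already has a higher
;;; learned Q-value; once every action at s has been executed *Ne* times,
;;; the choice reverts to the greedy one-step lookahead on U.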