;;;; Code for performance assessment of DP and RL algorithms.

;;; Makes extensive use of global variables to minimize interference with the
;;; algorithms themselves.

(defvar *policy-fn*)  ;;; the policy used by the agent in acting
(defvar *correct-U*)  ;;; the correct utility table
(defvar *correct-M*)  ;;; model passed to value-determination (presumably the
                      ;;; correct transition model -- confirm against caller)
(defvar *correct-R*)  ;;; reward table passed to value-determination; also
                      ;;; enumerates the full set of states

;;; Root-mean-square error between two utility tables.
;;; U1 and U2 are hash tables mapping states to utilities; U2 is the
;;; correct utility table.  Assumes every state in U1 also appears in U2.
;;; Returns 0 when U1 is empty (avoids division by zero).
(defun u-rms-error (U1 U2 &aux (n 0) (e 0))
  (maphash #'(lambda (s u)
               (incf n)
               (incf e (expt (- u (gethash s U2)) 2)))
           U1)
  (if (zerop n) 0 (sqrt (/ e n))))

;;; The policy loss of a utility function U for an mdp is defined as the
;;; difference in utility between the corresponding policy and the optimal
;;; policy, for the agent's current state.  Calculate using
;;; value determination wrt the current policy.
(defun loss (mdp U &aux (U2 (copy-hash-table U #'identity))
                        (M (mdp-model mdp))
                        (R (mdp-rewards mdp)))
  ;; Fill in missing entries, if any, with utility 0, so every state
  ;; known to *correct-R* has a value before value determination runs.
  (maphash #'(lambda (s md)
               (declare (ignore md))
               (unless (gethash s U2) (setf (gethash s U2) 0)))
           *correct-R*)
  (setq U2 (value-determination (funcall *policy-fn* U M R)
                                U2 *correct-M* *correct-R*))
  ;; Loss = correct utility of the initial state minus its utility under
  ;; the policy derived from U.
  (- (gethash (mdp-initial-state mdp) *correct-U*)
     (gethash (mdp-initial-state mdp) U2)))