# This program implements the action-value method from Section 2.6 for solving # bandit problems. This method uses a fixed step-size parameter, alpha, and # epsilon-greedy action selection with a fixed exploration parameter, epsilon. from random import random # random() returns a random real number in [0,1) from random import randrange # randrange(n) returns a random integer in {0,1,2,...,n} from random import gauss # gauss(0,1) returns a normally distributed number # with zero mean and unit variance n = 10 # number of possible actions or arms numbandits = 200 # number of different bandits (with different true action values) Qstar = [[gauss(0,1) for i in range(n)] # array of true action values for each bandit for j in range(numbandits)] def bandit(banditnumber,action): """Do one play with the bandit; return reward""" global Qstar return Qstar[banditnumber][action] + gauss(0,1) def playbandit(banditnum=0, numplays=1000, alpha=0.1, epsilon=0.1): """Play a bandit for some number of plays; return average reward""" global n Q = [0.0 for i in range(n)] # array to hold estimated action values totalreward = 0 for t in range(numplays): if random() < epsilon: # epsilon-greedy action selection action = randrange(n) else: action = argmax(Q) reward = bandit(banditnum,action) totalreward = totalreward + reward Q[action] = Q[action] + alpha * (reward - Q[action]) # learning update return totalreward/numplays def argmax(array): """Return the index of the maximal value in the array/list, breaking ties randomly; array must have at least one element""" bestindex = 0 bestvalue = array[0] numbests = 1 for i in range(1,len(array)): value = array[i] if value < bestvalue: pass elif value > bestvalue: bestvalue = value bestindex = i numbests = 1 elif value == bestvalue: numbests += 1 if random() < 1.0/numbests: bestindex = i return bestindex def mean(list): """Return arithmetic mean (average) of a list's values""" sum = 0 for item in list: sum += item return sum/float(len(list)) def performance(alpha,epsilon): return mean([playbandit(i,1000,alpha,epsilon) for i in range(numbandits)]) print performance(0.1,0.1) # 0) Find the value of numbandits such that performance runs in about 30 seconds # 1) Repeat the performance calculation for values of alpha in [2**0, 2**-1,..., 2**-8] # (for the most reliable results, leave Qstar unchanged as you vary alpha) # 2) Briefly describe and explain the trend in performance as a function of alpha # 3) Find the best alpha in this set # 4) Using the best alpha, repeat for epsilon in [2**-2, 2**-3,..., 2**-10] # 5) Briefly describe and explain the trend in performance as a function of epsilon # 6) Why is the relative performance of the greedy method so unlike that in the book?