Source code for axelrod.strategies.qlearner
from collections import OrderedDict
from typing import Dict, Optional, Union
from axelrod.action import Action, actions_to_str
from axelrod.player import Player
Score = Union[int, float]
C, D = Action.C, Action.D

class RiskyQLearner(Player):
"""A player who learns the best strategies through the q-learning
algorithm.
This Q learner is quick to come to conclusions and doesn't care about the
future.
Names:
- Risky Q Learner: Original name by Geraint Palmer
"""
name = "Risky QLearner"
classifier = {
"memory_depth": float("inf"), # Long memory
"stochastic": True,
"long_run_time": False,
"inspects_source": False,
"manipulates_source": False,
"manipulates_state": False,
}
    learning_rate = 0.9  # weight given to newly observed rewards
    discount_rate = 0.9  # weight given to the value of the next state
    action_selection_parameter = 0.1  # probability of exploring at random
    memory_length = 12  # number of opponent moves encoded in the state

    def __init__(self) -> None:
        """Initialises the player's Q and V tables and the empty starting state."""
        super().__init__()
        # Set this explicitly, since the constructor of super will not pick it
        # up for any subclasses that do not override methods using random calls.
        self.classifier["stochastic"] = True
        self.prev_action: Optional[Action] = None
        self.original_prev_action: Optional[Action] = None
        self.score = 0
        self.Qs = OrderedDict({"": OrderedDict(zip([C, D], [0, 0]))})
        self.Vs = OrderedDict({"": 0})
        self.prev_state = ""

    def receive_match_attributes(self):
        # Build the payoff matrix from the game's (R, P, S, T) values; for the
        # default Axelrod game these are (3, 1, 0, 5).
        (R, P, S, T) = self.match_attributes["game"].RPST()
        self.payoff_matrix = {C: {C: R, D: S}, D: {C: T, D: P}}

    def strategy(self, opponent: Player) -> Action:
        """Runs the Q-learning algorithm while the match is being played."""
        # On the first turn there is no previous action, so seed with a random
        # choice.
        if len(self.history) == 0:
            self.prev_action = self._random.random_choice()
            self.original_prev_action = self.prev_action
        state = self.find_state(opponent)
        reward = self.find_reward(opponent)
        # Previously unseen states start with zero-valued Q entries for both
        # actions.
        if state not in self.Qs:
            self.Qs[state] = OrderedDict(zip([C, D], [0, 0]))
            self.Vs[state] = 0
        self.perform_q_learning(self.prev_state, state, self.prev_action, reward)
action = self.select_action(state)
self.prev_state = state
self.prev_action = action
return action
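
    # perform_q_learning is called from strategy() above but is missing from
    # this listing. The following is a reconstruction of the standard update
    #     Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * V(s'))
    # using this class's learning_rate (alpha), discount_rate (gamma), Qs and
    # Vs attributes.
    def perform_q_learning(
        self, prev_state: str, state: str, action: Action, reward: Score
    ) -> None:
        """Performs the Q-learning update for the previous state/action pair."""
        # Blend the old estimate with the reward plus the discounted value of
        # the newly reached state.
        self.Qs[prev_state][action] = (1.0 - self.learning_rate) * self.Qs[
            prev_state
        ][action] + self.learning_rate * (
            reward + self.discount_rate * self.Vs[state]
        )
        # The value of a state is the best Q-value currently known for it.
        self.Vs[prev_state] = max(self.Qs[prev_state].values())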

    def select_action(self, state: str) -> Action:
        """
        Selects the action with an epsilon-greedy policy: exploit the best
        known action with probability 1 - action_selection_parameter, and
        otherwise explore by choosing C or D uniformly at random.
        """
        rnd_num = self._random.random()
        p = 1.0 - self.action_selection_parameter
        if rnd_num < p:
            return max(self.Qs[state], key=lambda x: self.Qs[state][x])
        return self._random.random_choice()

    def find_state(self, opponent: Player) -> str:
        """
        Finds the state (the opponent's last n moves plus its previous
        proportion of playing C) as a hashable string.
        """
        # Proportion of C over the opponent's whole history, rounded to one
        # decimal place; zero on the first turn, when the history is empty.
        prob = "{:.1f}".format(
            opponent.cooperations / max(len(opponent.history), 1)
        )
        action_str = actions_to_str(opponent.history[-self.memory_length :])
        return action_str + prob
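
    # Example of the state encoding above: an opponent whose full history is
    # C, C, D, C maps to "CCDC" + "0.8" (three of four moves were C), i.e. the
    # state string "CCDC0.8".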

    def find_reward(self, opponent: Player) -> Score:
        """
        Finds the reward gained on the last iteration.
        """
        # On the first turn the opponent has no history, so a random previous
        # action is assumed.
        if len(opponent.history) == 0:
            opp_prev_action = self._random.random_choice()
        else:
            opp_prev_action = opponent.history[-1]
        return self.payoff_matrix[self.prev_action][opp_prev_action]
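
    # With the default game, for example, self.payoff_matrix[C][D] == 0: the
    # "sucker" payoff S for cooperating while the opponent defected.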


class ArrogantQLearner(RiskyQLearner):
    """A player who learns the best strategies through the Q-learning
    algorithm.

    This Q learner jumps to quick conclusions and cares about the future.

    Names:

    - Arrogant Q Learner: Original name by Geraint Palmer
    """
name = "Arrogant QLearner"
learning_rate = 0.9
discount_rate = 0.1


class HesitantQLearner(RiskyQLearner):
    """A player who learns the best strategies through the Q-learning
    algorithm.

    This Q learner is slower to come to conclusions and does not look ahead
    much.

    Names:

    - Hesitant Q Learner: Original name by Geraint Palmer
    """
name = "Hesitant QLearner"
learning_rate = 0.1
discount_rate = 0.9


class CautiousQLearner(RiskyQLearner):
    """A player who learns the best strategies through the Q-learning
    algorithm.

    This Q learner is slower to come to conclusions and wants to look ahead
    more.

    Names:

    - Cautious Q Learner: Original name by Geraint Palmer
    """
name = "Cautious QLearner"
learning_rate = 0.1
discount_rate = 0.1
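

# Usage sketch (not part of this module): the four learners differ only in
# their learning_rate/discount_rate pair and can be played against any other
# strategy through axelrod's Match interface, for example:
#
#     import axelrod as axl
#
#     players = (axl.RiskyQLearner(), axl.TitForTat())
#     match = axl.Match(players, turns=20, seed=1)
#     match.play()  # the list of (Action, Action) pairs, one per turn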