#!/usr/bin/env python3
import sys
import random

# Note that this solution ignores naturals.

# A hand is represented as a pair (total, ace) where:
# - total is the point total of the cards in the hand (counting aces as 1)
# - ace is True iff the hand contains an ace

# Return the empty hand.
def empty_hand():
    return (0, False)

# Return whether the hand has a useable ace.
# Note that a hand may contain an ace that is not useable.
def hand_has_useable_ace(hand):
    total, ace = hand
    return ace and (total + 10) <= 21

# Return the value of the hand.
# The value of the hand is either total or total + 10 (if the ace is useable).
def hand_value(hand):
    total, ace = hand
    if hand_has_useable_ace(hand):
        return total + 10
    return total

# Update a hand by adding a card to it.
# Return the new hand.
def hand_add_card(hand, card):
    total, ace = hand
    total = total + card
    if card == 1:
        ace = True
    return (total, ace)

# Return the reward of the game (-1, 0, or 1) given the final player and
# dealer hands.
def game_reward(player_hand, dealer_hand):
    player_val = hand_value(player_hand)
    dealer_val = hand_value(dealer_hand)
    if player_val > 21:
        return -1.0
    elif dealer_val > 21:
        return 1.0
    elif player_val < dealer_val:
        return -1.0
    elif player_val == dealer_val:
        return 0.0
    else:
        return 1.0

# Draw a card from an unbiased deck.
# Return the face value of the card (1 to 10).
def draw_card_unbiased():
    card = random.randint(1, 13)
    if card > 10:
        card = 10
    return card

# Draw a card from a biased deck rich in tens and aces.
# Return the face value of the card (1 to 10).
#
# The project description was unclear about this.  Rich in "tens" could also
# mean rich in J, Q, K as well, so it is fine if your solution did that.
def draw_card_biased():
    # choose an ace with probability 11/130
    r = random.randint(1, 130)
    if r <= 11:
        return 1
    # choose a ten with probability 11/130
    if r <= 22:
        return 10
    # otherwise choose 2-9, J, Q, K with equal probability
    card = random.randint(2, 12)
    if card > 10:
        card = 10
    return card

# Deal a player hand given the function used to draw cards.
# Return only the hand.
def deal_player_hand(draw):
    hand = empty_hand()
    hand = hand_add_card(hand, draw())
    hand = hand_add_card(hand, draw())
    while hand_value(hand) < 11:
        hand = hand_add_card(hand, draw())
    return hand

# Deal the first card of a dealer hand given the function used to draw cards.
# Return the hand and the shown card.
def deal_dealer_hand(draw):
    hand = empty_hand()
    card = draw()
    hand = hand_add_card(hand, card)
    return hand, card

# Play the dealer hand using the dealer's fixed strategy (hit below 17).
# Return the resulting dealer hand.
def play_dealer_hand(hand, draw):
    while hand_value(hand) < 17:
        hand = hand_add_card(hand, draw())
    return hand

# States are tuples (card, val, useable) where
# - card is the card the dealer is showing
# - val is the current value of the player's hand
# - useable is whether or not the player has a useable ace
# Actions are either stay (False) or hit (True).

# Select a state at random.
def select_random_state(all_states):
    n = len(all_states)
    r = random.randint(0, n - 1)
    return all_states[r]

# Select an action at random.
def select_random_action():
    r = random.randint(1, 2)
    return r == 1
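# A minimal sanity check of the hand representation defined above.  This
# helper is an illustrative addition, not part of the original assignment,
# and is never called by the program: drawing an ace and then a 7 gives the
# pair (8, True), the ace is useable, and the hand is therefore valued at 18.
def _check_hand_representation():
    hand = empty_hand()
    hand = hand_add_card(hand, 1)   # the ace counts as 1 in the stored total
    hand = hand_add_card(hand, 7)
    assert hand == (8, True)
    assert hand_has_useable_ace(hand)
    assert hand_value(hand) == 8 + 10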
# Select the best action using the current Q-values.
def select_best_action(Q, state):
    return Q[(state, True)] > Q[(state, False)]

# Select an action according to the epsilon-greedy policy.
def select_e_greedy_action(Q, state, epsilon):
    r = random.random()
    if r < epsilon:
        return select_random_action()
    return select_best_action(Q, state)

# Given a state, return the dealer's shown card and player and dealer hands
# consistent with it.
def hands_from_state(state):
    card, val, useable = state
    if useable:
        val = val - 10
    player_hand = (val, useable)
    dealer_hand = empty_hand()
    dealer_hand = hand_add_card(dealer_hand, card)
    return card, dealer_hand, player_hand

# Given the dealer's shown card and the player's hand, return the state.
def state_from_hands(card, player_hand):
    val = hand_value(player_hand)
    useable = hand_has_useable_ace(player_hand)
    return (card, val, useable)

# Return a list of all possible states.
def state_list():
    states = []
    for card in range(1, 11):
        for val in range(11, 22):
            states.append((card, val, False))
            states.append((card, val, True))
    return states

# Return a map of all (state, action) pairs -> values (initially zero).
def initialize_state_action_value_map():
    states = state_list()
    M = {}
    for state in states:
        M[(state, False)] = 0.0
        M[(state, True)] = 0.0
    return M

# Print a (state, action) -> value map.
def print_state_action_value_map(M):
    for useable in [True, False]:
        if useable:
            print('Usable ace')
        else:
            print('No usable ace')
        print('Values for staying:')
        for val in range(21, 10, -1):
            for card in range(1, 11):
                print('%5.2f' % M[((card, val, useable), False)], end='   ')
            print('| %d' % val)
        print('Values for hitting:')
        for val in range(21, 10, -1):
            for card in range(1, 11):
                print('%5.2f' % M[((card, val, useable), True)], end='   ')
            print('| %d' % val)
        print()

# Print the state-action-value function (Q).
def print_Q(Q):
    print('---- Q(s,a) ----')
    print_state_action_value_map(Q)

# Print the state-value function (V) given the Q-values.
def print_V(Q):
    print('---- V(s) ----')
    for useable in [True, False]:
        if useable:
            print('Usable ace')
        else:
            print('No usable ace')
        for val in range(21, 10, -1):
            for card in range(1, 11):
                v = max(Q[((card, val, useable), True)],
                        Q[((card, val, useable), False)])
                print('%5.2f' % v, end='   ')
            print('| %d' % val)
        print()

# Print the greedy policy given the Q-values ('X' means hit).
def print_policy(Q):
    print('---- Policy ----')
    for useable in [True, False]:
        if useable:
            print('Usable ace')
        else:
            print('No usable ace')
        for val in range(21, 10, -1):
            for card in range(1, 11):
                if Q[((card, val, useable), True)] > Q[((card, val, useable), False)]:
                    print('X', end=' ')
                else:
                    print(' ', end=' ')
            print('| %d' % val)
        print()

# Initialize Q-values so that they produce the initial policy of sticking
# only on 20 or 21.
def initialize_Q():
    states = state_list()
    M = {}
    for state in states:
        card, val, useable = state
        if val < 20:
            M[(state, False)] = -0.001
            M[(state, True)] = 0.001   # favor hitting
        else:
            M[(state, False)] = 0.001  # favor sticking
            M[(state, True)] = -0.001
    return M

# Initialize the number of times each (state, action) pair has been observed.
def initialize_count():
    return initialize_state_action_value_map()
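# The Monte Carlo update below maintains a running mean of the observed
# returns using only the visit count: with n previous samples and a new
# return R,
#     Q_{n+1} = (n * Q_n + R) / (n + 1) = Q_n + (R - Q_n) / (n + 1),
# so no list of past returns needs to be stored.  A minimal sketch of that
# identity (this helper is an illustration only; monte_carlo_es inlines the
# first form directly):
def _running_mean(mean, n, x):
    return mean + (x - mean) / (n + 1.0)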
# Monte Carlo ES.
#
# Run Monte Carlo ES (exploring starts) for the specified number of
# iterations and return the resulting Q-values.
def monte_carlo_es(draw, n_iter):
    # initialize Q and the observation count
    Q = initialize_Q()
    count = initialize_count()
    # get the list of all states
    all_states = state_list()
    # iterate
    for n in range(n_iter):
        # initialize the list of (state, action) pairs encountered in the episode
        sa_list = []
        # pick a starting (state, action) using exploring starts
        state = select_random_state(all_states)
        action = select_random_action()
        dealer_card, dealer_hand, player_hand = hands_from_state(state)
        # update the (state, action) list
        sa_list.append((state, action))
        # generate the episode, following the current greedy policy
        while action:
            player_hand = hand_add_card(player_hand, draw())
            if hand_value(player_hand) > 21:
                break
            state = state_from_hands(dealer_card, player_hand)
            action = select_best_action(Q, state)
            sa_list.append((state, action))
        # allow the dealer to play
        dealer_hand = play_dealer_hand(dealer_hand, draw)
        # compute the return
        R = game_reward(player_hand, dealer_hand)
        # update Q using the average return
        for sa in sa_list:
            Q[sa] = (Q[sa] * count[sa] + R) / (count[sa] + 1)
            count[sa] = count[sa] + 1
    return Q

# Q-learning.
#
# Run Q-learning for the specified number of iterations and return the
# resulting Q-values.  In this implementation alpha itself is constant, but
# the effective step size alpha / N(s,a) decreases over time.
def q_learning(draw, n_iter, alpha, epsilon):
    # initialize Q and the observation count
    Q = initialize_Q()
    count = initialize_count()
    # get the list of all states
    all_states = state_list()
    # iterate
    for n in range(n_iter):
        # initialize the state
        state = select_random_state(all_states)
        dealer_card, dealer_hand, player_hand = hands_from_state(state)
        # choose actions and update Q until the episode ends
        while True:
            action = select_e_greedy_action(Q, state, epsilon)
            sa = (state, action)
            if action:
                # draw a card and update the player's hand
                player_hand = hand_add_card(player_hand, draw())
                # check whether the player busted
                if hand_value(player_hand) > 21:
                    # update the Q-value with the terminal reward
                    count[sa] = count[sa] + 1.0
                    Q[sa] = Q[sa] + alpha / count[sa] * (-1.0 - Q[sa])
                    break
                else:
                    # update the Q-value toward the best next value
                    s_next = state_from_hands(dealer_card, player_hand)
                    q_best = Q[(s_next, False)]
                    if Q[(s_next, True)] > q_best:
                        q_best = Q[(s_next, True)]
                    count[sa] = count[sa] + 1.0
                    Q[sa] = Q[sa] + alpha / count[sa] * (q_best - Q[sa])
                    # update the state
                    state = s_next
            else:
                # allow the dealer to play
                dealer_hand = play_dealer_hand(dealer_hand, draw)
                # compute the return
                R = game_reward(player_hand, dealer_hand)
                # update the Q-value
                count[sa] = count[sa] + 1.0
                Q[sa] = Q[sa] + alpha / count[sa] * (R - Q[sa])
                break
    return Q
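# The updates in q_learning above are all instances of the standard tabular
# Q-learning rule
#     Q(s,a) <- Q(s,a) + alpha_n * (r + gamma * max_a' Q(s',a') - Q(s,a))
# with gamma = 1, reward 0 on non-terminal hits, and alpha_n = alpha / N(s,a).
# A worked single step (the numbers are chosen purely for illustration): with
# alpha_n = 0.5, r = 0, Q(s,a) = 0.0 and best next value 0.2, the update gives
# 0.0 + 0.5 * (0.0 + 1 * 0.2 - 0.0) = 0.1, as checked by this unused helper:
def _q_update_example():
    q_sa, q_best, alpha_n = 0.0, 0.2, 0.5
    q_new = q_sa + alpha_n * (0.0 + q_best - q_sa)
    assert abs(q_new - 0.1) < 1e-12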
# Estimate the expected value of the game using the learned Q-values.
def expected_gain(draw, Q, n_iter):
    gain = 0.0
    for n in range(n_iter):
        player_hand = deal_player_hand(draw)
        dealer_hand, dealer_card = deal_dealer_hand(draw)
        state = state_from_hands(dealer_card, player_hand)
        v = Q[(state, False)]
        if Q[(state, True)] > v:
            v = Q[(state, True)]
        gain = gain + v
    print('Expected gain: %6.3f' % (gain / float(n_iter)))
    print()

# Main program
if __name__ == '__main__':
    # set parameters
    n_iter_mc = 10000000
    n_iter_q = 10000000
    n_games = 100000
    alpha = 1.0
    epsilon = 0.1
    # run the learning algorithms
    print('MONTE CARLO ES -- UNBIASED DECK')
    Q = monte_carlo_es(draw_card_unbiased, n_iter_mc)
    print_Q(Q)
    print_V(Q)
    print_policy(Q)
    expected_gain(draw_card_unbiased, Q, n_games)
    print('MONTE CARLO ES -- BIASED DECK')
    Q = monte_carlo_es(draw_card_biased, n_iter_mc)
    print_Q(Q)
    print_V(Q)
    print_policy(Q)
    expected_gain(draw_card_biased, Q, n_games)
    print('Q-LEARNING -- UNBIASED DECK')
    Q = q_learning(draw_card_unbiased, n_iter_q, alpha, epsilon)
    print_Q(Q)
    print_V(Q)
    print_policy(Q)
    expected_gain(draw_card_unbiased, Q, n_games)
    print('Q-LEARNING -- BIASED DECK')
    Q = q_learning(draw_card_biased, n_iter_q, alpha, epsilon)
    print_Q(Q)
    print_V(Q)
    print_policy(Q)
    expected_gain(draw_card_biased, Q, n_games)
    sys.exit(0)
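# Usage note (an addition; the timings are rough expectations, not
# measurements): with 10^7 learning iterations per configuration the full
# run may take several minutes or more depending on hardware, so reduce
# n_iter_mc and n_iter_q for a quick smoke test.  Calling random.seed() with
# a fixed value at the top of the main block makes runs reproducible.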