Udacity AI for Robotics Dynamic Programming

This is the code for the left turn policy at the end of lesson 12 in Sebastian Thrun's Udacity course.

The following code was copied from the course solution and from the GitHub repos of students who completed the course (or who copied it from the video of the course solution), and it works.

I've printed out the value and policy 3D matrices and can see them change on each iteration of the main for loop, but surprisingly the 4 submatrices of value and policy change identically. I thought each submatrix was for one of the 4 headings/orientations that the car could face (up, down, left, right), so that there would be different values in each submatrix, because the costs of turning left, turning right, or driving straight ahead are different.

I also don't understand how the first cell next to the goal ends up with a value of 20, which is the cost of a left turn, rather than 1, which is the cost of driving straight ahead.

I thought the purpose of dynamic programming was to track the results of recursive calls.

If I were to rewrite this from scratch, I would probably do a BFS or DFS search and keep track of the minimum total cost from start to goal. However, the Udacity autograder would not accept my BFS solutions to other questions, even though they produced the same answers. So I'm trying to learn how this works the Udacity way.

Anyway, here is the full Udacity code with additional print statements so you can see what I mean. Does anyone know how this algorithm works?

```python
# ----------
# User Instructions:
# 
# Implement the function optimum_policy2D below.
#
# You are given a car in grid with initial state
# init. Your task is to compute and return the car's 
# optimal path to the position specified in goal; 
# the costs for each motion are as defined in cost.
#
# There are four motion directions: up, left, down, and right.
# Increasing the index in this array corresponds to making a
# left turn, and decreasing the index corresponds to making a
# right turn.

# (row, col) displacement for one step in each heading. The order matters:
# stepping to the next index is a left turn, to the previous one a right turn.
forward = (
    (-1, 0),   # up
    (0, -1),   # left
    (1, 0),    # down
    (0, 1),    # right
)
# Human-readable label for each entry of `forward`, in the same order.
forward_name = (
    'up',
    'left',
    'down',
    'right',
)

# The three possible actions, expressed as an offset added to the heading
# index: -1 turns right, 0 keeps the heading, +1 turns left.
action = (
    -1,  # right turn
    0,   # no turn
    1,   # left turn
)
# Symbol written into the policy for each entry of `action`, in the same order.
action_name = (
    'R',
    '#',
    'L',
)

# EXAMPLE INPUTS:
# grid format:
#     0 = navigable space
#     1 = unnavigable space 
grid = (
    (1, 1, 1, 0, 0, 0),
    (1, 1, 1, 0, 1, 0),
    (0, 0, 0, 0, 0, 0),
    (1, 1, 1, 0, 1, 1),
    (1, 1, 1, 0, 1, 1),
)

# Start state as (row, col, direction), where direction is
# 0: up, 1: left, 2: down, 3: right.
init = (4, 3, 0)

# Target cell as (row, col).
goal = (2, 0)

# Cost of each action, in the order: right turn, no turn, left turn.
cost = (2, 1, 20)

# EXAMPLE OUTPUT:
# calling optimum_policy2D with the given parameters should return 
# ((' ', ' ', ' ', 'R', '#', 'R'),
#  (' ', ' ', ' ', '#', ' ', '#'),
#  ('*', '#', '#', '#', '#', 'R'),
#  (' ', ' ', ' ', '#', ' ', ' '),
#  (' ', ' ', ' ', '#', ' ', ' '))
# ----------

# ----------------------------------------
# modify code below
# ----------------------------------------

# up, left, down, right
#heading = (0, -1, 0, 1)

def optimum_policy2D(grid, init, goal, cost, verbose=False):
    """Plan the cheapest route to ``goal`` for a car that pays per action.

    The search state is (heading, row, col), not just (row, col): the same
    cell has a different cost-to-goal depending on which way the car faces,
    because the turn needed to leave it differs. ``value[h][r][c]`` is the
    cheapest known cost from cell (r, c) facing heading ``h`` to the goal,
    and ``policy[h][r][c]`` is the action that achieves it. Both tables are
    relaxed by repeated sweeps over every state until a sweep changes
    nothing. The policy is then followed from ``init`` and the actions met
    on the way are written into a 2D map, which is printed and returned.

    Uses the module-level tables ``forward``, ``action`` and ``action_name``.

    Args:
        grid: Rows of cells; 0 is navigable, anything else is blocked.
        init: Start state as (row, col, heading); heading indexes ``forward``.
        goal: Target cell as (row, col).
        cost: Cost of each entry of ``action`` (right turn, no turn, left turn).
        verbose: When true, print the value and policy plane of every heading
            after each sweep.

    Returns:
        A list of lists shaped like ``grid``: '*' at the goal, the action
        symbol on each cell of the route, and ' ' elsewhere. If the goal
        cannot be reached from ``init`` the map contains only ' '.
    """
    n_rows = len(grid)
    n_cols = len(grid[0])
    n_headings = len(forward)

    # Infinity instead of a fixed 999, so routes costing 999 or more are
    # still recognised as better than "no route known".
    unknown = float("inf")

    # One 2D plane per heading. These must be real nested lists (not
    # generators) because they are indexed and updated in place.
    value = [[[unknown] * n_cols for _ in range(n_rows)]
             for _ in range(n_headings)]
    policy = [[[' '] * n_cols for _ in range(n_rows)]
              for _ in range(n_headings)]

    # The 2D map that is printed and returned.
    policy2D = [[' '] * n_cols for _ in range(n_rows)]

    sweep = 0
    change = True
    while change:
        change = False

        for x in range(n_rows):
            for y in range(n_cols):
                for heading in range(n_headings):
                    if x == goal[0] and y == goal[1]:
                        # Arriving at the goal is free whatever the heading.
                        if value[heading][x][y] > 0:
                            value[heading][x][y] = 0
                            policy[heading][x][y] = '*'
                            change = True
                    elif grid[x][y] == 0:
                        # An action first changes the heading and then moves
                        # one cell in the new heading; cost[i] pays for both.
                        for i, turn in enumerate(action):
                            h2 = (heading + turn) % n_headings
                            x2 = x + forward[h2][0]
                            y2 = y + forward[h2][1]

                            if (0 <= x2 < n_rows and 0 <= y2 < n_cols
                                    and grid[x2][y2] == 0):
                                v2 = value[h2][x2][y2] + cost[i]
                                if v2 < value[heading][x][y]:
                                    value[heading][x][y] = v2
                                    policy[heading][x][y] = action_name[i]
                                    change = True

        sweep += 1
        if verbose:
            print("iter {}".format(sweep))

            # Index by the heading being printed so that each plane is
            # shown, rather than plane 0 once per heading.
            print('value\n')
            for plane in value:
                for row in plane:
                    print(row)
                print('next\n')

            print('policy\n')
            for plane in policy:
                for row in plane:
                    print(row)
                print('next\n')

    # After value iteration, follow the policy from the start state and
    # record each action on the 2D map.
    x = init[0]
    y = init[1]
    heading = init[2]

    policy2D[x][y] = policy[heading][x][y]

    while policy[heading][x][y] != '*':
        step = policy[heading][x][y]
        if step == ' ':
            # No action was ever recorded here: the goal is unreachable.
            break
        heading = (heading + action[action_name.index(step)]) % n_headings
        x = x + forward[heading][0]
        y = y + forward[heading][1]
        # A cell visited twice keeps the action of the last visit.
        policy2D[x][y] = policy[heading][x][y]

    print('policy2D')
    for row in policy2D:
        print(row)
    return policy2D

# Run the planner on the example inputs defined above.
optimum_policy2D(grid=grid, init=init, goal=goal, cost=cost)
```