Hack machine language assembler as required for project 6 of Nand2Tetris

This is the implementation of the Assembler required to parse source code written in the Hack Machine Language and output it to a 16-bit binary file.

After writing one go in Swift, I decided I wanted to give it a go writing it in C as a challenge to myself to become more comfortable with the language as well as taking the opportunity to write my own hash table.

I’d love any feedback on any aspect of the code but I’m particularly looking for areas of improvement with regards to pointers and memory management. I don’t believe Valgrind is supported on M1 Mac machines, therefore the memory allocation and deallocation was written somewhat blind.

I’d also appreciate any pointers (no pun intended) on how I can improve adherence to C conventions.

Assembler.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "error/Error.h"
#include "components/Stripper.h"
#include "components/Parser.h"

#define MAX_ASSEMBLED_SIZE 1048576

static void assemble(const char *file, char *output_name);

// Entry point for program
int main(int argc, char** argv) {
    
    switch (argc) {
        case 2:
            // No file name specified - revert to "assembled.hack"
            assemble(argv(1), NULL);
            break;
        case 3:
            // Use client-specified file name
            assemble(argv(1), argv(2));
            break;
        default:
            // ❌
            exit_with_error(INCORRECT_ARGUMENT_COUNT);      
    }
    return 0;

}

// Assembles a file into binary code
static void assemble(const char *file_name, char *output_name) {

    long size_in_bytes;
    char* file_to_assemble;
    printf("Starting assembly of file %sn", file_name);
    
    // Check if the file exists
    FILE *file = fopen(file_name, "r");
    if (file == NULL) {
        exit_with_error(FILE_NOT_FOUND);
    }
    else {

        // Create a HashMap to store variables and label definitions
        HashMap* hash_map = hash_map_create();

        // Retrieve the size of the file (max 500k)
        fseek(file, 0, SEEK_END);
        size_in_bytes = ftell(file);

        if (size_in_bytes > MAX_ASSEMBLED_SIZE / 2) {
            exit_with_error(FILE_TOO_LARGE);
        }
        
        fseek(file, 0, SEEK_SET); 
        file_to_assemble = malloc(size_in_bytes + 1);

        if (file_to_assemble) {
            
            // Read the contents of the file into the buffer
            fread(file_to_assemble, 1, size_in_bytes, file);
            char no_comments(size_in_bytes + 1);
            char no_spaces(size_in_bytes + 1);
            char no_labels(size_in_bytes + 1);
            char parsed(MAX_ASSEMBLED_SIZE + 1);
            
            // Remove comments and blank spaces. Save and remove labels, save variables
            strip_comments(no_comments, file_to_assemble);
            strip_spaces(no_spaces, no_comments);
            strip_labels(no_labels, no_spaces, hash_map);
            save_variables(no_labels, hash_map);

            // Translate the remaining assembly code to binary
            parse(parsed, no_labels, hash_map);

            char output_file_name(256);

            // If the client chose a custom output name
            if (output_name != NULL) {
                strcpy(output_file_name, output_name);
            }
            else {
                strcpy(output_file_name, "assembled.hack");
            }
            
            // Write the file
            FILE *output = fopen(output_file_name, "w");
            fwrite(parsed, 1, strlen(parsed), output);
            fclose(file);
            hash_map_free(hash_map);
            printf("Assembly complete");
        }
        else {
            // Could not open file. Bail
            exit_with_error(FILE_READ_ERROR);
        }
    }
}

Stripper.c

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdbool.h>
#include <ctype.h>
#include "HashMap.h"
#include "Parser.h"

#define SCREEN_ADDRESS 16384
#define KBD_ADDRESS 24576
#define SP_ADDRESS 0
#define LCL_ADDRESS 1
#define ARG_ADDRESS 2
#define THIS_ADDRESS 3
#define THAT_ADDRESS 4

// Removes extraneous white space and blank lines
void strip_spaces (char* dst, const char* src) {

    bool have_reached_printable_char = false;
    int count = 0;
    int length = strlen(src);

    while(*src != '') {
        if (count == length - 1 && *src == 'n')
            break;
        have_reached_printable_char = have_reached_printable_char ? true : isprint(*src);
        if(have_reached_printable_char && (*src == 'n' || !isspace(*src))) {
            *dst = *src; // then copy
            dst++;
        }
        count++;
        src++;
    }

    *dst = '';
}

// Remove comments (like this!) from file
void strip_comments(char* dst, const char* src) {

    bool copy = true;

    for (int i = 0; i < strlen(src); i++) {
        if (src(i) == 'n')
            copy = true;
        if (src(i) == '/' && src(i+1) == '/')
            copy = false;
        if (copy) {
            *dst = src(i);
            dst++;
        }
    }
    *dst = '';
}

// Map particular variables to corresponding address
static void map_reserved_variables(HashMap* hash_map) {
    
    hash_map_put(hash_map, "screen", SCREEN_ADDRESS);
    hash_map_put(hash_map, "kbd", KBD_ADDRESS);
    hash_map_put(hash_map, "sp", SP_ADDRESS);
    hash_map_put(hash_map, "lcl", LCL_ADDRESS);
    hash_map_put(hash_map, "arg", ARG_ADDRESS);
    hash_map_put(hash_map, "this", THIS_ADDRESS);
    hash_map_put(hash_map, "that", THAT_ADDRESS);

    for (int i = 0; i < 16; i++) {
        int length = i < 10 ? 2 : 3;
        char reg(length + 1);
        reg(0) = 'r';
        sprintf(reg + 1, "%d", i);
        hash_map_put(hash_map, reg, i);
    }
}

    

// Remove label definitions and replace them with corresponding line number of
// logic following the definition
void strip_labels(char* dst, const char* src, HashMap* hash_map) {

    map_reserved_variables(hash_map);

    int current_command = 0;
    bool save_command = false;
    bool new_command = true;
    bool copy = true;
    char current_label(256);
    int current_label_index = 0;
    char last_copied;

    while(*src != '') {
        if (*src == 'n') {
            new_command = true;
            if (last_copied == 'n') {
                src++;
            }
            current_command++;
            copy = true;
        }

        if (*src == ')' && save_command) {  
            save_command = false;
            current_label(current_label_index) = '';
            // Move backwards to go back to the command we were dealing with
            current_command--;
            for (int i = 0; i <= strlen(current_label); i++) {
                char lowered = tolower(current_label(i));
                current_label(i) = lowered;
            }
            // Now move forward one line and save whatever command number that is
            hash_map_put(hash_map, current_label, current_command+1);
            current_label_index = 0;
        }
        if (save_command) {
            current_label(current_label_index++) = *src;
        }
        if (new_command && *src == '(') {
            save_command = true;
            copy = false;
        }       
        if (copy) {
            *dst = *src;
            dst++;
            last_copied = *src;
        }
        src++;
    }
        *dst = '';
}

// Save any user declared variables
void save_variables(char* dst, HashMap* hash_map) {
    bool is_a_variable_declaration = false;
    char current_variable(256);
    int current_variable_index = 0;
    int current_variable_address = 16;

    while(*dst != '') {

        if (*dst == 'n') {
            if (is_a_variable_declaration) {
                is_a_variable_declaration = false;
                current_variable(current_variable_index) = '';
                current_variable_index = 0;

                if (!is_integral_string(current_variable)) {
                    if (hash_map_contains(hash_map, current_variable)) {
                        // It's a label declaration that we've already saved in the method above
                        continue;
                    }
                    else {
                        hash_map_put(hash_map, current_variable, current_variable_address++);
                    }
                }
            }
        }
        if (is_a_variable_declaration) {
            current_variable(current_variable_index++) = tolower(*dst);
        }
        if (*dst == '@') {
            char next = tolower(*(++dst));
            if (next != 'r' && next != '0') {
                is_a_variable_declaration = true;
            }
            dst--;
        }
        dst++;
    }
}

Parser.c

#include <stdbool.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "HashMap.h"
#include "Parser.h"
#include "../error/Error.h"

#define VALID_DESTINATION_COUNT 7
#define VALID_COMPUTATION_COUNT 28
#define VALID_JUMP_COUNT 7
#define WORD_LENGTH 16

#define A_START 0

#define C_A_OR_M_START 3
#define C_A_OR_M_BIT_LENGTH 1

#define C_DEFAULTS_START 0
#define C_DEFAULTS_BIT_LENGTH 3
#define C_DEFAULTS_ADDRESS 7

#define C_COMP_START 4
#define C_COMP_BIT_LENGTH 6

#define C_JUMP_START 13
#define C_JUMP_BIT_LENGTH 3

#define C_DEST_BIT_LENGTH 3
#define C_DEST_START 10

// Declarations

static bool is_a_command(const char* str); 

static int jump_address(const char* str);
static int destination_address(const char* str);
static int get_computation_address(const char* cmd);
static int get_jump_address(const char* cmd, int jump_operation_position); 
static int get_destination_address(const char* cmd, int assignment_operation_position);

static void parse_c_command(char* dst, const char* cmd);
static void parse_command(char* dst, const char *cmd, HashMap* hash_map); 
static void parse_a_command(char* dst, const char* cmd, HashMap* hash_map); 
static void to_bin(char* cmd, int address, int number_of_places, int starting_position); 
static void get_positions(const char* cmd, int* assignment, int* computation, int *termination, int* jump); 


// *********** CONSTANTS ************** //

// Allowed destinations in C instruction
const char* valid_destinations(VALID_DESTINATION_COUNT) = { "M", "D", "MD", "A", "AM", "AD", "AMD" };
// Allowed computations in C instruction
const char* valid_computations(VALID_COMPUTATION_COUNT) = { "0", "1", "-1", "D", "A", "!D", "!A", "-D", 
                                                            "-A", "D+1", "A+1", "D-1", "A-1", "D+A", "D-A", 
                                                            "A-D", "D&A", "D|A", "M", "!M", "-M", "M+1", 
                                                            "M-1", "D+M", "D-M", "M-D", "D&M", "D|M" };
// Denary representation of a valid computation. 
// For example the instruction D|M (see above) corresponds to 21 -> (101001)
const int valid_computation_integers(VALID_COMPUTATION_COUNT) = { 42, 63, 58, 12, 48, 13, 49, 15, 
                                                                    51, 31, 55, 14, 50, 2, 19,
                                                                    7, 0, 21, 48, 49, 51, 
                                                                    55, 50, 2, 19, 7, 0, 21 }; 
// Allowed jump commands in C instruction
const char* valid_jumps(VALID_JUMP_COUNT) = { "JGT", "JEQ", "JGE", "JLT", "JNE", "JLE", "JMP" };


// *********** PARSING ************** //

// Parse a source asm file that has been stripped of whitespace and comments
void parse(char* dst, char* src, HashMap* hash_map) {

    char current_label(256);
    int current_label_position = 0;
    int dest_position = 0;

    for (int i = 0; i <= strlen(src); i++) {
    
        if (src(i) == 'n' || src(i) == '') {
            // We've either reached a line break or EOF
            current_label(current_label_position) = '';
            char parsed(WORD_LENGTH + 1);
            to_bin(parsed, 0, WORD_LENGTH, 0);
            parsed(WORD_LENGTH) = '';
            // Parse the current command
            parse_command(parsed, current_label, hash_map);
            for (int j = 0; j < strlen(parsed); j++) {
                // Add the newly parsed command to the output file
                dst(dest_position++) = parsed(j);   
            }
            // Reset label posiion and add n or  to output
            current_label_position = 0;
            dst(dest_position++) = src(i);
            continue;
        }
        current_label(current_label_position++) = src(i);
    }
    // Done, make sure to end with null terminator
    dst(dest_position) = '';
}

static void parse_command(char* dst, const char *cmd, HashMap* hash_map) {

    // Can either be A command ("@command") or C command ("D=D+1;JGE")
    if (is_a_command(cmd)) {
        parse_a_command(dst, cmd, hash_map);
    }
    else {
        parse_c_command(dst, cmd);
    }
}

static void parse_c_command(char* dst, const char* cmd) {

    int assignment_operation_position = 0;
    int computation_operation_position = 0;
    int jump_operation_position = 0;
    int computation_termination_position = 0;

    // Fill in parts of parsed command that are common to all C instructions
    to_bin(dst, C_DEFAULTS_ADDRESS, C_DEFAULTS_BIT_LENGTH, C_DEFAULTS_START);
    get_positions(cmd, &assignment_operation_position, &computation_operation_position, &computation_termination_position, &jump_operation_position);

    // Split the command into destination (if applicable), computation and jump (if applicable)

    if (assignment_operation_position != 0) {
        int address = get_destination_address(cmd, assignment_operation_position);
        if (address == -1) {
            exit_with_error(MALFORMED_DESTINATION);
        }
        else { 
            to_bin(dst, address, C_DEST_BIT_LENGTH, C_DEST_START);
        }
    }

    int computation_string_length = computation_termination_position - computation_operation_position;
    char computation_string(computation_string_length + 1);
    strncpy(computation_string, cmd+computation_operation_position, computation_string_length);
    computation_string(computation_string_length) = '';
    int computation_address = get_computation_address(computation_string);

    if (computation_address == -1) {
        exit_with_error(MALFORMED_COMPUTATION);
    }
    else {
        to_bin(dst, computation_address, C_COMP_BIT_LENGTH, C_COMP_START);
        for (int i = computation_operation_position; i < computation_termination_position; i++) {
            if (tolower(cmd(i)) == 'm') {
                to_bin(dst, 1, C_A_OR_M_BIT_LENGTH, C_A_OR_M_START);
            }
        }
    }
    
    if (jump_operation_position != 0) {
        int address = get_jump_address(cmd, jump_operation_position);
        if (address == -1) {
            exit_with_error(MALFORMED_JUMP);
        }
        else {
            to_bin(dst, address, C_JUMP_BIT_LENGTH, C_JUMP_START);
        }
    }
}

static void parse_a_command(char* dst, const char* cmd, HashMap* hash_map) {

    size_t cmd_length = strlen(cmd);
    char lowered(cmd_length);
    int index = 0;

    // Disregard '@' by starting at index 1
    for (size_t i = 1; i <= cmd_length; i++) {
        lowered(index++) = tolower(cmd(i));
    }
    if (!is_integral_string(lowered)) {
        // It's a user-declared variable
        int value = hash_map_get(hash_map, lowered);
        to_bin(dst, value, WORD_LENGTH, A_START);
    }
    else {
        // It's a direct address (eg '@42')
        char address_string(cmd_length);
        strncpy(address_string, lowered, cmd_length);
        int address = atoi(address_string);
        to_bin(dst, address, WORD_LENGTH, A_START);
    }   
}


// *********** ADDRESSES ************** //

// Retreive generic address from array
static int get_address(const char* str, const char** argv, unsigned int count) {

    for (int i = 0; i < count; i++) {
        if (strcmp(str, argv(i)) == 0) {
            return i + 1;
        }
    }
    return -1;
}

// Retrieve destination address from C instruction
static int get_destination_address(const char* cmd, int assignment_operation_position) {

    char destination(assignment_operation_position + 1);
    strncpy(destination, cmd, assignment_operation_position);
    destination(assignment_operation_position) = '';
    return destination_address(destination);
}

static int destination_address(const char* str) {
    return get_address(str, valid_destinations, VALID_DESTINATION_COUNT);
}

// Retrieve computation address from C instruction
static int get_computation_address(const char* cmd) {

    for (int i = 0; i < VALID_COMPUTATION_COUNT; i++) {
        if (strcmp(valid_computations(i), cmd) == 0) {
            return valid_computation_integers(i);
        }
    }
    return -1;
}

// Retrieve jump address from C instruction
static int get_jump_address(const char* cmd, int jump_operation_position) {

    int jump_operation_length = strlen(cmd) - jump_operation_position;
    char jump_operation(jump_operation_length + 1);
    strncpy(jump_operation, cmd+jump_operation_position, jump_operation_length);
    jump_operation(jump_operation_length) = '';
    return jump_address(jump_operation);

} 

static int jump_address(const char* str) {
   return get_address(str, valid_jumps, VALID_JUMP_COUNT);
}


// *********** HELPER ************** //

static bool is_a_command(const char* str) {
    return str(0) == '@' && strlen(str) > 1;
}

// Is the command pure integers?
bool is_integral_string(char *str) {

    size_t length = strlen(str);
    for (int i = 0; i < length; i++) {
        if (!isdigit(str(i))) {
            return false;
        }
    }
    return true;
}

// Retrieve positions of assignment, computation, 
// and jump instructions as well as the point of termination of a computation
static void get_positions(const char* cmd, int* assignment, int* computation, int *termination, int* jump) {

    size_t length = strlen(cmd);
    for (int i = 0; i < length; i++) {
        if (cmd(i) == '=') {
            *assignment = i;
            *computation = i+1;
        }
        if (cmd(i) == ';') {
            *jump = i+1;
            *termination = i;
        }
    }
    if (*termination == 0) {
        *termination = length;
    }
}

// Convert an address to binary padding with number of places, 
// starting (0 being the 'end' of the string) at starting_position
static void to_bin(char* cmd, int address, int number_of_places, int starting_position) {

    int quo = address;
    int index = number_of_places + starting_position - 1;

    int rem;
    for (int i = 0; i < number_of_places; i++) {
        rem = quo % 2;
        cmd(index--) = rem == 1 ? '1' : '0';
        quo = quo / 2;
    }
}

HashMap.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "HashMap.h"

// Hashing function, courtesy of
// http://www.cse.yorku.ca/~oz/hash.html
static int hashed(char* arg) {

    unsigned long hash = 5381;
    int c;
    while ((c = *arg++))
        hash = ((hash << 5) + hash) + c; 
    return hash % NUM_BUCKETS;
}

// Create a HashMap (caller to free) 
HashMap* hash_map_create() {
    HashMap *hash_map = malloc(sizeof(HashMap));
    for (int i = 0; i < NUM_BUCKETS; i++)
        hash_map->buckets(i) = NULL;
    return hash_map;
}

// Insert into HashMap (assumes non-negative value)
void hash_map_put(HashMap* hash_map, char* key, int value) {

    int hashed_key = hashed(key);
    int result = hash_map_get(hash_map, key);

    if (result != -1) {
        // Overwrite old value
        Node *current = hash_map->buckets(hashed_key);
        while (current->key != key) {
            current = current->next;
        }
        current->value = value;
    }
    else {
        if (hash_map->buckets(hashed_key) == NULL) {
            // Insert at first node in bucket
            hash_map->buckets(hashed_key) = malloc(sizeof(Node));
            hash_map->buckets(hashed_key)->key =  malloc(sizeof(char) * (strlen(key) + 1));
            strcpy(hash_map->buckets(hashed_key)->key, key);
            hash_map->buckets(hashed_key)->value = value;
            hash_map->buckets(hashed_key)->next = NULL;
        }
        else {
            // Collision, traverse to end of linked list
            Node *current = hash_map->buckets(hashed_key);
            Node *new = malloc(sizeof(Node));
            new->key = malloc(sizeof(char) * (strlen(key) + 1));
            strcpy(new->key, key);
            new->value = value;
            new->next = NULL;
            while (current->next != NULL) {
                current = current->next;
            }
            current->next = new;
        }
    }
}

bool hash_map_contains(HashMap* hash_map, char* key) {
    return hash_map_get(hash_map, key) != -1;
}

// Retrieve value for key from HashMap
int hash_map_get(HashMap* hash_map, char* key) {

    int hashed_key = hashed(key);
    Node *current = hash_map->buckets(hashed_key);
    int returned = -1;

    while (current != NULL) {
        if (strcmp(current->key, key) == 0) {
            returned = current->value;
            break;
        }
        current = current->next;
    }
    return returned;
}

// Remove value for key in HashMap
void hash_map_remove(HashMap* hash_map, char* key) {

    int hashed_key = hashed(key);
    if (hash_map_get(hash_map, key) != -1) {
        Node *current = hash_map->buckets(hashed_key);
        if (current->key == key) {
            // Remove from head node
            Node *new_current = current->next;
            current = NULL;
            hash_map->buckets(hashed_key) = new_current;
        }
        else {
            // Traverse and remove when found
            while (current->next->key != key) {
                current = current->next;
            }
            Node *old_next = current->next;
            Node *new_next = current->next->next;
            current->next = new_next;
            old_next = NULL;
        }
    }
}

// Free HashMap
void hash_map_free(HashMap* hash_map) {
    free(hash_map);
}

Error.c

#include <stdio.h>
#include <stdlib.h>
#include "Error.h"

void error_string_for_code(int code, char **star);

void exit_with_error(int code) {
    char *str = "";
    error_string_for_code(code, &str);
    printf("Exiting with error: '%s'n", str);
    exit(code);
}

void error_string_for_code(int code, char **str) {
    
    switch (code) {
        case INCORRECT_ARGUMENT_COUNT:
            *str = "Incorrect argument count";
            break;
        case FILE_NOT_FOUND:
            *str = "The file doesn't exist";
            break;
        case FILE_READ_ERROR:
            *str = "Could not read file";
            break;
        case NULL_POINTER:
            *str = "Null pointer found";
            break; 
        case UNKNOWN_COMMAND:
            *str = "Unrecognized command";
            break;
        case MALFORMED_DESTINATION:
            *str = "Invalid destination received";
            break;
        case MALFORMED_COMPUTATION:
            *str = "Received a missing computation";
            break;
        case MALFORMED_JUMP:
            *str = "Invalid jump received";
            break;
        case FILE_TOO_LARGE:
            *str = "File too large, please ensure it is less than 500kb";
            break;
        default:
            *str = "Unknown error.";
        break;
    }
}
```