c++ – K-clustering algorithm using Kruskal MST with Disjoint Set in place to check for cycles


here below a working implementation that finds the minimal distance between k(set =4 below) clusters in a graph.
I have doubts mainly on the implementation of the Disjoint Set structure: instead of using STL containers, I went along with an array of pointers as data structure hosting the leader nodes, in order to have some practice with C-style pointers.

Here is the Graph structure:
Graph.h

// creates and manages a graph data structure
#ifndef GRAPH_H_INCLUDED
#define GRAPH_H_INCLUDED

#include <array>
#include <vector>
#include <list>
#include <fstream>
#include <string>
#include <iostream>

class Edge
{
public:
    // constructors
    Edge();
    Edge(int, int, int);

    std::array<int, 2> getNodes() const;
    void setNodes(const int, const int);
    int getCost() const;
    void setCost(const int);
    void swapNodes();
    // a get() that allows writing:
    int operator()(const int);

    bool operator==(const Edge&) const;
    bool operator!=(const Edge&) const;

private:
    int node1, node2;       // nodes are just indices of the nodes in the graph
    int cost;
};


class Node
{
    friend class Graph;       // friendship needed by printing routine in Graph
public:
    // constructors
    Node();
    Node(const Edge&);

    int getLabel() const {return label;}
    Edge getEdge(const int) const;
    void setLabel(const int in) {label = in;}
    void addEdge(const Edge in) {edges.push_back(in);}
    void addManyEdges(const std::vector<Edge>);
    int getScore() const {return score;}
    void setScore(const int in) {score = in;}

    std::list<Edge>::size_type size() const {return edges.size();}  // size of list 'edges'

    // iterators
    typedef std::list<Edge>::iterator iterator;
    typedef std::list<Edge>::const_iterator const_iterator;
    std::list<Edge>::iterator begin() {return edges.begin();}
    std::list<Edge>::iterator end() {return edges.end();}
    std::list<Edge>::const_iterator cbegin() const {return edges.begin();}
    std::list<Edge>::const_iterator begin() const {return edges.begin();}
    std::list<Edge>::const_iterator cend() const {return edges.end();}
    std::list<Edge>::const_iterator end() const {return edges.end();}

    // inserts group of edges to the end of a edges vector
    std::list<Edge>::iterator insertEdges(const std::list<Edge>::iterator beg_in,
                                               const std::list<Edge>::iterator end_in)
    {
        return edges.insert(end(), beg_in, end_in);
    }
    // erase node
    std::list<Edge>::iterator erase(int);

    bool operator==(const Node&) const;
    bool operator!=(const Node&) const;

private:
    int label;
    std::list<Edge> edges;
    int score;          // new, starts at 10000000, is equal to lowest cost Edge in 'edges'
};


class Graph
{
public:
    Graph();
    // constructor from txt file
    Graph(const std::string&);

    Node getNode(const int index) const {return nodes(index);}
    void addNode(const Node in) {nodes.push_back(in);}

    std::vector<Node>::size_type size() {return nodes.size();}  // size of vector 'nodes'
    std::vector<Node>::size_type size() const {return nodes.size();}  // size of vector 'nodes'
    void output() const;      // prints graph
    void output(const int) const;

    // iterators
    typedef std::vector<Node>::iterator iterator;
    typedef std::vector<Node>::const_iterator const_iterator;
    std::vector<Node>::iterator begin() {return nodes.begin();}
    std::vector<Node>::iterator end() {return nodes.end();}
    std::vector<Node>::const_iterator begin() const {return nodes.begin();}
    std::vector<Node>::const_iterator end() const {return nodes.end();}

    Node& operator()(const int index)
    {
        return nodes(index);
    }

    std::vector<Node>::iterator erase(const int index)
    {
        return nodes.erase(nodes.begin() + index);
    }

private:
    std::vector<Node> nodes;
};

bool compareCosts(const Edge&, const Edge&);
bool compareScores(const Node&, const Node&);
bool compareLabels(const Node&, const Node&);

#endif // GRAPH_H_INCLUDED

Graph.cpp:

#include <iostream>
#include <fstream>
#include <array>
#include <list>
#include <string>
#include <algorithm>
#include "Graph.h"

using std::array;       using std::ifstream;
using std::string;      using std::endl;
using std::cout;        using std::list;
using std::equal;

// Edge

// constructors
Edge::Edge():node1(0), node2(0), cost(0) {}
Edge::Edge(int node_1, int node_2, int len): node1(node_1), node2(node_2), cost(len) {}


array<int, 2> Edge::getNodes() const
{
    array<int, 2> ret = {node1, node2};

    return ret;
}


void Edge::setNodes(const int in1, const int in2)
{
    node1 = in1;
    node2 = in2;
}


int Edge::getCost() const
{
    return cost;
}


void Edge::setCost(const int len)
{
    cost = len;
}


void Edge::swapNodes()
{
    node1 = node1 - node2;
    node2 += node1;
    node1 = node2 - node1;
}


// same as getNodes() above
int Edge::operator()(const int index)
{
    if (index == 0) return node1;
    else if (index == 1) return node2;
    else {
        try {throw;}
        catch(...) {cout << "edge index must be either 0 or 1" << endl;}
        return 1;
    }
}


bool Edge::operator==(const Edge& rhs) const
{
    if ( (node1 == rhs.getNodes()(0)) && (node2 == rhs.getNodes()(1)) && cost == rhs.getCost() )
        return true;
    else 
        return false;
}


bool Edge::operator!=(const Edge& rhs) const
{
    if ( !(*this == rhs) ) 
        return true;
    else 
        return false;
}


// Node

//constructors
Node::Node(): label(0), edges(0), score(10000000) {}
Node::Node(const Edge& edg): label(edg.getNodes()(0)), edges(0), score(10000000)
{
    edges.push_back(edg);
}


Edge Node::getEdge(const int index) const
{
    Edge ret;
    list<Edge>::const_iterator iter = edges.begin();
    advance(iter, index);

    return *iter;
}


void Node::addManyEdges(const vector<Edge> input)
{
    for (size_t i = 0; i != input.size(); i++)
    {
         edges.push_back(input(i));
    }
}


list<Edge>::iterator Node::erase(int index)
{
    list<Edge>::iterator iter = edges.begin();
    advance(iter, index);

    return edges.erase(iter);
}


bool Node::operator==(const Node& rhs) const
{
    return label == rhs.getLabel() && equal( edges.begin(), edges.end(), rhs.begin() );  // no need to equate scores
}


bool Node::operator!=(const Node& rhs) const
{
    return !(*this == rhs);
}


// Graph

// constructors
Graph::Graph(): nodes(0) {}
Graph::Graph(const string& file_input): nodes(0)    // constructor from file
{
    string filename(file_input + ".txt");
    string line;
    ifstream is;
    is.open(filename);

    int number_nodes; //, number_edges;
    is >> number_nodes; // >> number_edges;
    nodes.resize(number_nodes);  // reserve the Node vector of size 'number_nodes'

    int node1, node2, cost;
    while (is >> node1 >> node2 >> cost)
    {
        int nodes_array(2) = {node1, node2};
        for (int& node_i : nodes_array) {
            if (nodes(node_i - 1).size() == 1)
                nodes(node_i - 1).setLabel(node_i);
        }

        Edge current_edge(node1, node2, cost);

        nodes(node1 - 1).addEdge(current_edge);
        if (node1 != node2) {
            current_edge.swapNodes();
            nodes(node2 - 1).addEdge(current_edge);
        }
    }

    is.close();
}


// prints all input nodes
void Graph::output() const
{
    for (size_t i = 0; i != nodes.size(); ++i)
    {
        cout << "Node " << nodes(i).getLabel() << ", size = " << nodes(i).edges.size() << "  with edges: ";
        for (size_t j = 0; j != nodes(i).edges.size(); ++j)
        {
            int node_left = nodes(i).getEdge(j).getNodes()(0);
            int node_right = nodes(i).getEdge(j).getNodes()(1);
            int cost = nodes(i).getEdge(j).getCost();
            cout << "(" << node_left << "-" << node_right << ", " << cost << ")   ";
        }
        cout << endl;
    }
}


// prints 10 neighbours around picked node
void Graph::output(const int picked_node) const
{
    for (int i = picked_node - 5; i != picked_node + 5; ++i)
    {
        cout << "Node " << nodes(i).getLabel() << ", with edges: ";
        for (size_t j = 0; j != nodes(i).edges.size(); ++j)
        {
            int node_left = nodes(i).getEdge(j).getNodes()(0);
            int node_right = nodes(i).getEdge(j).getNodes()(1);
            int cost = nodes(i).getEdge(j).getCost();
            int score = nodes(node_right - 1).getScore();
            cout << "(" << node_left << "-" << node_right << ", " << cost << ", " << score << ")   ";
        }
        cout << endl;
    }
}


bool compareCosts(const Edge& a, const Edge& b)
{
    return a.getCost() < b.getCost();
}


bool compareScores(const Node& a, const Node& b)
{
    return a.getScore() > b.getScore();
}


bool compareLabels(const Node& a, const Node& b)
{
    return a.getLabel() > b.getLabel();
}

BFS implementation for Kruskal (alternative to Disjoint Set Kruskal implementation)
BreadthFirstSearch.h

#ifndef BREADTHFIRSTSEARCH_H_INCLUDED
#define BREADTHFIRSTSEARCH_H_INCLUDED

#include <limits>
#include "Graph.h"

int const infinity = std::numeric_limits<int>::infinity();

bool breadthFirstSearch(const Graph&, const int, const int);

#endif // BREADTHFIRSTSEARCH_H_INCLUDED

BreadthFirstSearch.cpp

#include <iostream>
#include <queue>
#include <vector>
#include <map>
#include <algorithm>
#include "BreadthFirstSearch.h"

using std::cout;    using std::endl;
using std::cin;     using std::find_if; 
using std::vector;  using std::queue;
using std::map;

bool breadthFirstSearch(const Graph& G, const int start_node_label, const int target_node_label)
{
    // define type for explored/unexplored Nodes
    enum is_visited {not_visited, visited};
    map<int, is_visited> node_is_visited;
    for (Graph::const_iterator iter = G.begin(); iter != G.end(); ++iter)
        node_is_visited(iter->getLabel()) = not_visited;

    Graph::const_iterator start_node_iter =
           find_if(G.begin(), G.end(), (=)(const Node& i){return i.getLabel() == start_node_label;});
    if ( start_node_iter == G.end() )
        return false;
    node_is_visited(start_node_label) = visited;

    Node next_node;
    next_node = *start_node_iter;

    // breadth-first algorithm runs based on queue structure
    queue<Node> Q;
    Q.push(next_node);

    // BFS algorithm
    while (Q.size() != 0) {     // out of main loop if all nodes searched->means no path is present
        Node current_node = Q.front();
        Node linked_node;       // variable hosting node on other end
        for (size_t i = 0; i != current_node.size(); ++i) {     // explore nodes linked to current_node by an edge
            int linked_node_label = current_node.getEdge(i).getNodes()(1);  
            Graph::const_iterator is_linked_node_in_G =
                       find_if(G.begin(), G.end(), (=)(const Node& a){return a.getLabel() == linked_node_label;});
            if ( is_linked_node_in_G != G.end() ) {    // check linked_node is in G
                linked_node = *is_linked_node_in_G; //G_tot.getNode(linked_node_label - 1);
                if (node_is_visited(linked_node_label) == not_visited) {    // check if linked_node is already in the queue
                    node_is_visited(linked_node_label) = visited;
                    Q.push(linked_node);                                    // if not, add it to the queue
//                    cout << "current " << current_node.getLabel()       // for debug
//                         << " | linked = " << linked_node_label + 1
//                         << " | path length = " << dist(linked_node_label) << endl;
                    if (linked_node_label == target_node_label)  // end search once target node is explored
                        return true;
                }
            } else {
                if (linked_node_label == target_node_label)  // end search once target node is explored
                    return false;
            }
        }
        Q.pop();
    }

    return false;
}

DisjointSet.h

#ifndef DISJOINTSET_H_INCLUDED
#define DISJOINTSET_H_INCLUDED

#include "Graph.h"

class DisjointSet
{
public:
    DisjointSet(const size_t);
    ~DisjointSet();
    DisjointSet& operator= (const DisjointSet&);

    int find(const Node&);
    void unionNodes(const Node&, const Node&);

    int get(int index) {return *leaders(index);}

private:
    size_t size;        // graph size needed for allocation of pointer data members below
    int* base;          // array of int each Node of the graph has its leader
    int** leaders;      // array of pointers to int, allows to reassign leaders to Nodes after unions

    int find_int(int);  // auxiliary to 'find' above

    DisjointSet(const DisjointSet&);    // copy constructor forbidden
};

#endif // DISJOINTSET_H_INCLUDED

DisjointSet.cpp (here is where advice would be most appreciated)

// Union-find structure (lazy unions)
#include "DisjointSet.h"

DisjointSet::DisjointSet(size_t in): size(in), base(new int(in)), leaders(new int*(in))
{
    for (size_t i = 1; i != in + 1; ++i) {
        base(i - 1) = i;
        leaders(i - 1) = &base(i - 1);
    }
}


DisjointSet::~DisjointSet()
{
    delete() base;
    delete() leaders;
}


DisjointSet& DisjointSet::operator= (const DisjointSet& rhs)
{
    if (this == &rhs)
        return *this;       // make sure you aren't self-assigning
    if (base != NULL) {
        delete() leaders;     // get rid of the old data
        delete() base;
    }

    // "copy constructor" from here
    size = rhs.size;
    base = new int(size);
    leaders = new int*(size);
    base = rhs.base;
    for (size_t i = 0; i != size; ++i)
        leaders(i) = &base(i);

    return *this;
}


// auxiliary to find: implements the recursion
int DisjointSet::find_int(int leader_pos)
{
    int parent(leader_pos);
    if (leader_pos != *leaders(leader_pos - 1))
        parent = find_int(*leaders(leader_pos - 1));

    return parent;
}


// returns leader to input Node
int DisjointSet::find(const Node& input)
{
    int parent( input.getLabel() );
    if (input.getLabel() != *leaders(input.getLabel() - 1))
        parent = find_int(*leaders(input.getLabel() - 1));

    return parent;
}


// merges sets by assigning same leader (the lesser of the two Nodes)
void DisjointSet::unionNodes(const Node& a, const Node& b)
{
    if (find(a) != find(b)) {
        if (find(a) < find(b))
            leaders(find(b) - 1) = &base(find(a) - 1);
        else
            leaders(find(a) - 1) = &base(find(b) - 1);
    }
}

KruskalClustering.h

#ifndef KRUSKALCLUSTERING_H_INCLUDED
#define KRUSKALCLUSTERING_H_INCLUDED

#include <vector>
#include "Graph.h"
#include "DisjointSet.h"

int clusteringKruskalNaive(const Graph&, const std::vector<Edge>&, int);
int clusteringKruskalDisjointSet(const Graph& graph0, const std::vector<Edge>& edges, int);

#endif // KRUSKALCLUSTERING_H_INCLUDED

KruskalClustering.cpp

// Kruskal MST algorithm. Implementation specific to  (k=4)-clustering
// -naive (with breadth-first-search to check whether new edge creates a cycle); cost: O(#_edges * #_nodes)
// -and union-find implementations.  Cost: O(#_edges*log2(#_nodes))
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>        //std::find_if
#include "BreadthFirstSearch.h"
#include "KruskalClustering.h"

using std::cout;            using std::endl;
using std::string;          using std::vector;
using std::find_if;

int clusteringKruskalNaive(const Graph& graph0, const vector<Edge>& edges, int k)
{
    Graph T;    // Minimum Spanning Tree
    vector<Edge>::const_iterator edges_iter = edges.begin();
    int sum_costs(0), number_of_clusters( graph0.size() );  // keep track of overall cost of edges in T, and of clusters
    while (number_of_clusters >= k) {

        // find out if first node of edge is already in T
        Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
                    (=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(0) - 1).getLabel();});
        bool is1_in_T_flag;         // needed because T gets increased and thus invalidates iterator is1_in_T
        Node* node_1 = new Node(*edges_iter); // no use of pointer here, it creates a new Node anyway, can't move Nodes to T
        if ( is1_in_T == T.end() ) {      // node_1 not in T so we add it
            T.addNode(*node_1);
            number_of_clusters--;         // node_1 is not its own cluster any more
            delete node_1;                // node_1 copied to T so ...
            node_1 = &(T(T.size() - 1));  // ... it now points there
            sum_costs += (*edges_iter).getCost();
            is1_in_T_flag = false;
        } else {                          // node_1 is in T already
            delete node_1;                // if so, just update the pointer
            node_1 = &(*is1_in_T);
            is1_in_T_flag = true;
        }

        // perform BFS to check for cycles
        bool check_cycles = breadthFirstSearch(T, edges_iter->getNodes()(0), edges_iter->getNodes()(1));

        // create an identical edge, but with nodes positions swapped
        Edge swapped_edge = *edges_iter;
        swapped_edge.swapNodes();
        // find out if second node of edge is already in T
        Graph::iterator is2_in_T = find_if( T.begin(), T.end(),
                    (=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(1) - 1).getLabel();});
        // (either node1 or 2 not in T, or both, or both present but in different clusters of T)
        if (!check_cycles) {         // if edges_iter creates no cycle when added to T
            if (is1_in_T_flag){          // if node_1 was already present in T
                (*node_1).addEdge(*edges_iter);       // just add new edge to node_1 list of edges
                sum_costs += (*edges_iter).getCost();
            } else
                number_of_clusters++;   // if node_1 not present, it means number_of_cl was decreased above ...

            number_of_clusters--;       // ... and number_of_cl can decrease just by one, if adding an edge to T
            if ( is2_in_T != T.end() )  // node_2 already in T: just update its list of edges
                (*is2_in_T).addEdge(swapped_edge);
            else {                      // node_2 not in T, so add it
                Node node_2(swapped_edge);
                T.addNode(node_2);
            }

        } else {                        // cycle created by *edges_iter
            if (!is1_in_T_flag)         // in case the cycle happened just after adding node_1:
                (*is2_in_T).addEdge(swapped_edge);   // add edge to node_2, num_clusters already updated by node_1
        }
        if (number_of_clusters >= k)    // advance to next Edge if num_clusters > k
            edges_iter++;

        // debug
//        T.output();
//        cout << "next edge: (" << (*edges_iter).getNodes()(0) << "-"
//             << (*edges_iter).getNodes()(1) << ") " << endl;
//        cout << "clustering: " << number_of_clusters << endl;
    }
    cout << "Sum of MST lengths is: " << sum_costs << endl;

    return (*edges_iter).getCost();
}


// same algorithm, implemented with Union-find structure
int clusteringKruskalDisjointSet(const Graph& graph0, const vector<Edge>& edges, int k)
{
    DisjointSet disjoint_set_graph0( graph0.size() );       // create Union-find structure

    Graph T;
    vector<Edge>::const_iterator edges_iter = edges.begin();
    int sum_costs(0), number_of_clusters( graph0.size() );

    while ( number_of_clusters >= k ) {
        // if nodes in Edge have not the same leader in the disjoint set, then no loop is created, and T can add the edge
        if ( disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()(0) - 1)) !=
        disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()(1) - 1)) ) {
            sum_costs += (*edges_iter).getCost();
            number_of_clusters--;               // no cycle created so the edge will be added to T

            // look for node_1 in T
            Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
                    (=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(0) - 1).getLabel();});
            if ( is1_in_T == T.end() ) {   // if node_1 not in T add it
                Node node1(*edges_iter);
                T.addNode(node1);
            } else                         // if node_1 already in T only add to it this edge
                (*is1_in_T).addEdge(*edges_iter);

            Edge swapped_edge = *edges_iter;
            swapped_edge.swapNodes();
            // look for node_2 in T
            Graph::iterator is2_in_T = find_if(T.begin(), T.end(),
                    (=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(1) - 1).getLabel();});
            if ( is2_in_T == T.end() ) {     // same as for node_1
                Node node2(swapped_edge);
                T.addNode(node2);
            } else
                (*is2_in_T).addEdge(swapped_edge);

            // merge the 2 nodes' sets: update their disjointed set leaders
            disjoint_set_graph0.unionNodes( graph0.getNode(edges_iter->getNodes()(0) - 1),
                                       graph0.getNode(edges_iter->getNodes()(1) - 1) );
        }

        if (number_of_clusters >= 4)
            edges_iter++;
        //debug
//        T.output();
//        cout << "next edge: (" << (*edges_iter).getNodes()(0) << "-"
//             << (*edges_iter).getNodes()(1) << ") " << endl;
//        cout << "clustering: " << number_of_clusters << endl;
//        for (size_t i = 0; i != graph0.size(); ++i)
//            cout << disjoint_set_graph0.get(i) << " ";
//        cout << endl;
    }
    cout << "Sum of MST lengths is: " << sum_costs << endl;

    return (*edges_iter).getCost();
}

Lastly, main.cpp :

/* A max-spacing k-clustering program based on Kruskal MST algorithm.
The input file lists a complete graph with edge costs.
clusters k = 4 assumed.*/
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>        // std::sort;
#include "Graph.h"
#include "KruskalClustering.h"

using std::cout;                using std::endl;
using std::string;              using std::ifstream;
using std::vector;              using std::sort;

int main(int argc, char** argv)
{
    cout << "Reading list of edges from input file ...n" << endl;

    // read graph0 from input file
    string filename(argv(1));

    Graph graph0(filename);
//    graph0.output();   // debug
    cout << endl;

    // re-read input file and create a list of all edges in graph0
    ifstream is(filename + ".txt");
    int nodes_size;
    is >> nodes_size;
    vector<Edge> edges;
    int node1, node2, cost;

    while (is >> node1 >> node2 >> cost) {
        Edge current_edge(node1, node2, cost);
        edges.push_back(current_edge);
    }
    is.close();

    // sort the edge list by increasing cost
    cout << "Sorting edges ...n" << endl;
    sort(edges.begin(), edges.end(), compareCosts);
//    for (vector<Edge>::iterator iter = edges.begin(); iter != edges.end(); ++iter)       // debug
//        cout << (*iter).getNodes()(0) << " " << (*iter).getNodes()(1) << "  " << (*iter).getCost() << endl;

    cout << "Kruskal algorithm: Computing minimal distance between clusters when they are reduced to 4 ...n" << endl;

    int k = 4;   // number of clusters desired

    // pick implementation, comment the other
    //int clustering_min_dist = clusteringKruskalNaive(graph0, edges, k);
    int clustering_min_dist = clusteringKruskalDisjointSet(graph0, edges, k);

    cout << "k = " << k << " clusters minimal distance is: " << clustering_min_dist << endl;

return 0;
}