# c++ – K-clustering algorithm using Kruskal MST with Disjoint Set in place to check for cycles

here below a working implementation that finds the minimal distance between k(set `=4` below) clusters in a graph.
I have doubts mainly on the implementation of the Disjoint Set structure: instead of using STL containers, I went along with an array of pointers as data structure hosting the leader nodes, in order to have some practice with C-style pointers.

Here is the Graph structure:
Graph.h

``````// creates and manages a graph data structure
#ifndef GRAPH_H_INCLUDED
#define GRAPH_H_INCLUDED

#include <array>
#include <vector>
#include <list>
#include <fstream>
#include <string>
#include <iostream>

class Edge
{
public:
// constructors
Edge();
Edge(int, int, int);

std::array<int, 2> getNodes() const;
void setNodes(const int, const int);
int getCost() const;
void setCost(const int);
void swapNodes();
// a get() that allows writing:
int operator()(const int);

bool operator==(const Edge&) const;
bool operator!=(const Edge&) const;

private:
int node1, node2;       // nodes are just indices of the nodes in the graph
int cost;
};

class Node
{
friend class Graph;       // friendship needed by printing routine in Graph
public:
// constructors
Node();
Node(const Edge&);

int getLabel() const {return label;}
Edge getEdge(const int) const;
void setLabel(const int in) {label = in;}
int getScore() const {return score;}
void setScore(const int in) {score = in;}

std::list<Edge>::size_type size() const {return edges.size();}  // size of list 'edges'

// iterators
typedef std::list<Edge>::iterator iterator;
typedef std::list<Edge>::const_iterator const_iterator;
std::list<Edge>::iterator begin() {return edges.begin();}
std::list<Edge>::iterator end() {return edges.end();}
std::list<Edge>::const_iterator cbegin() const {return edges.begin();}
std::list<Edge>::const_iterator begin() const {return edges.begin();}
std::list<Edge>::const_iterator cend() const {return edges.end();}
std::list<Edge>::const_iterator end() const {return edges.end();}

// inserts group of edges to the end of a edges vector
std::list<Edge>::iterator insertEdges(const std::list<Edge>::iterator beg_in,
const std::list<Edge>::iterator end_in)
{
return edges.insert(end(), beg_in, end_in);
}
// erase node
std::list<Edge>::iterator erase(int);

bool operator==(const Node&) const;
bool operator!=(const Node&) const;

private:
int label;
std::list<Edge> edges;
int score;          // new, starts at 10000000, is equal to lowest cost Edge in 'edges'
};

class Graph
{
public:
Graph();
// constructor from txt file
Graph(const std::string&);

Node getNode(const int index) const {return nodes(index);}

std::vector<Node>::size_type size() {return nodes.size();}  // size of vector 'nodes'
std::vector<Node>::size_type size() const {return nodes.size();}  // size of vector 'nodes'
void output() const;      // prints graph
void output(const int) const;

// iterators
typedef std::vector<Node>::iterator iterator;
typedef std::vector<Node>::const_iterator const_iterator;
std::vector<Node>::iterator begin() {return nodes.begin();}
std::vector<Node>::iterator end() {return nodes.end();}
std::vector<Node>::const_iterator begin() const {return nodes.begin();}
std::vector<Node>::const_iterator end() const {return nodes.end();}

Node& operator()(const int index)
{
return nodes(index);
}

std::vector<Node>::iterator erase(const int index)
{
return nodes.erase(nodes.begin() + index);
}

private:
std::vector<Node> nodes;
};

bool compareCosts(const Edge&, const Edge&);
bool compareScores(const Node&, const Node&);
bool compareLabels(const Node&, const Node&);

#endif // GRAPH_H_INCLUDED
``````

Graph.cpp:

``````#include <iostream>
#include <fstream>
#include <array>
#include <list>
#include <string>
#include <algorithm>
#include "Graph.h"

using std::array;       using std::ifstream;
using std::string;      using std::endl;
using std::cout;        using std::list;
using std::equal;

// Edge

// constructors
Edge::Edge():node1(0), node2(0), cost(0) {}
Edge::Edge(int node_1, int node_2, int len): node1(node_1), node2(node_2), cost(len) {}

array<int, 2> Edge::getNodes() const
{
array<int, 2> ret = {node1, node2};

return ret;
}

void Edge::setNodes(const int in1, const int in2)
{
node1 = in1;
node2 = in2;
}

int Edge::getCost() const
{
return cost;
}

void Edge::setCost(const int len)
{
cost = len;
}

void Edge::swapNodes()
{
node1 = node1 - node2;
node2 += node1;
node1 = node2 - node1;
}

// same as getNodes() above
int Edge::operator()(const int index)
{
if (index == 0) return node1;
else if (index == 1) return node2;
else {
try {throw;}
catch(...) {cout << "edge index must be either 0 or 1" << endl;}
return 1;
}
}

bool Edge::operator==(const Edge& rhs) const
{
if ( (node1 == rhs.getNodes()(0)) && (node2 == rhs.getNodes()(1)) && cost == rhs.getCost() )
return true;
else
return false;
}

bool Edge::operator!=(const Edge& rhs) const
{
if ( !(*this == rhs) )
return true;
else
return false;
}

// Node

//constructors
Node::Node(): label(0), edges(0), score(10000000) {}
Node::Node(const Edge& edg): label(edg.getNodes()(0)), edges(0), score(10000000)
{
edges.push_back(edg);
}

Edge Node::getEdge(const int index) const
{
Edge ret;
list<Edge>::const_iterator iter = edges.begin();

return *iter;
}

{
for (size_t i = 0; i != input.size(); i++)
{
edges.push_back(input(i));
}
}

list<Edge>::iterator Node::erase(int index)
{
list<Edge>::iterator iter = edges.begin();

return edges.erase(iter);
}

bool Node::operator==(const Node& rhs) const
{
return label == rhs.getLabel() && equal( edges.begin(), edges.end(), rhs.begin() );  // no need to equate scores
}

bool Node::operator!=(const Node& rhs) const
{
return !(*this == rhs);
}

// Graph

// constructors
Graph::Graph(): nodes(0) {}
Graph::Graph(const string& file_input): nodes(0)    // constructor from file
{
string filename(file_input + ".txt");
string line;
ifstream is;
is.open(filename);

int number_nodes; //, number_edges;
is >> number_nodes; // >> number_edges;
nodes.resize(number_nodes);  // reserve the Node vector of size 'number_nodes'

int node1, node2, cost;
while (is >> node1 >> node2 >> cost)
{
int nodes_array(2) = {node1, node2};
for (int& node_i : nodes_array) {
if (nodes(node_i - 1).size() == 1)
nodes(node_i - 1).setLabel(node_i);
}

Edge current_edge(node1, node2, cost);

if (node1 != node2) {
current_edge.swapNodes();
}
}

is.close();
}

// prints all input nodes
void Graph::output() const
{
for (size_t i = 0; i != nodes.size(); ++i)
{
cout << "Node " << nodes(i).getLabel() << ", size = " << nodes(i).edges.size() << "  with edges: ";
for (size_t j = 0; j != nodes(i).edges.size(); ++j)
{
int node_left = nodes(i).getEdge(j).getNodes()(0);
int node_right = nodes(i).getEdge(j).getNodes()(1);
int cost = nodes(i).getEdge(j).getCost();
cout << "(" << node_left << "-" << node_right << ", " << cost << ")   ";
}
cout << endl;
}
}

// prints 10 neighbours around picked node
void Graph::output(const int picked_node) const
{
for (int i = picked_node - 5; i != picked_node + 5; ++i)
{
cout << "Node " << nodes(i).getLabel() << ", with edges: ";
for (size_t j = 0; j != nodes(i).edges.size(); ++j)
{
int node_left = nodes(i).getEdge(j).getNodes()(0);
int node_right = nodes(i).getEdge(j).getNodes()(1);
int cost = nodes(i).getEdge(j).getCost();
int score = nodes(node_right - 1).getScore();
cout << "(" << node_left << "-" << node_right << ", " << cost << ", " << score << ")   ";
}
cout << endl;
}
}

bool compareCosts(const Edge& a, const Edge& b)
{
return a.getCost() < b.getCost();
}

bool compareScores(const Node& a, const Node& b)
{
return a.getScore() > b.getScore();
}

bool compareLabels(const Node& a, const Node& b)
{
return a.getLabel() > b.getLabel();
}
``````

BFS implementation for Kruskal (alternative to Disjoint Set Kruskal implementation)

``````#ifndef BREADTHFIRSTSEARCH_H_INCLUDED

#include <limits>
#include "Graph.h"

int const infinity = std::numeric_limits<int>::infinity();

bool breadthFirstSearch(const Graph&, const int, const int);

``````

``````#include <iostream>
#include <queue>
#include <vector>
#include <map>
#include <algorithm>

using std::cout;    using std::endl;
using std::cin;     using std::find_if;
using std::vector;  using std::queue;
using std::map;

bool breadthFirstSearch(const Graph& G, const int start_node_label, const int target_node_label)
{
// define type for explored/unexplored Nodes
enum is_visited {not_visited, visited};
map<int, is_visited> node_is_visited;
for (Graph::const_iterator iter = G.begin(); iter != G.end(); ++iter)
node_is_visited(iter->getLabel()) = not_visited;

Graph::const_iterator start_node_iter =
find_if(G.begin(), G.end(), (=)(const Node& i){return i.getLabel() == start_node_label;});
if ( start_node_iter == G.end() )
return false;
node_is_visited(start_node_label) = visited;

Node next_node;
next_node = *start_node_iter;

// breadth-first algorithm runs based on queue structure
queue<Node> Q;
Q.push(next_node);

// BFS algorithm
while (Q.size() != 0) {     // out of main loop if all nodes searched->means no path is present
Node current_node = Q.front();
Node linked_node;       // variable hosting node on other end
for (size_t i = 0; i != current_node.size(); ++i) {     // explore nodes linked to current_node by an edge
find_if(G.begin(), G.end(), (=)(const Node& a){return a.getLabel() == linked_node_label;});
if ( is_linked_node_in_G != G.end() ) {    // check linked_node is in G
//                    cout << "current " << current_node.getLabel()       // for debug
//                         << " | path length = " << dist(linked_node_label) << endl;
if (linked_node_label == target_node_label)  // end search once target node is explored
return true;
}
} else {
if (linked_node_label == target_node_label)  // end search once target node is explored
return false;
}
}
Q.pop();
}

return false;
}
``````

DisjointSet.h

``````#ifndef DISJOINTSET_H_INCLUDED
#define DISJOINTSET_H_INCLUDED

#include "Graph.h"

class DisjointSet
{
public:
DisjointSet(const size_t);
~DisjointSet();
DisjointSet& operator= (const DisjointSet&);

int find(const Node&);
void unionNodes(const Node&, const Node&);

private:
size_t size;        // graph size needed for allocation of pointer data members below
int* base;          // array of int each Node of the graph has its leader
int** leaders;      // array of pointers to int, allows to reassign leaders to Nodes after unions

int find_int(int);  // auxiliary to 'find' above

DisjointSet(const DisjointSet&);    // copy constructor forbidden
};

#endif // DISJOINTSET_H_INCLUDED
``````

DisjointSet.cpp (here is where advice would be most appreciated)

``````// Union-find structure (lazy unions)
#include "DisjointSet.h"

DisjointSet::DisjointSet(size_t in): size(in), base(new int(in)), leaders(new int*(in))
{
for (size_t i = 1; i != in + 1; ++i) {
base(i - 1) = i;
leaders(i - 1) = &base(i - 1);
}
}

DisjointSet::~DisjointSet()
{
delete() base;
}

DisjointSet& DisjointSet::operator= (const DisjointSet& rhs)
{
if (this == &rhs)
return *this;       // make sure you aren't self-assigning
if (base != NULL) {
delete() leaders;     // get rid of the old data
delete() base;
}

// "copy constructor" from here
size = rhs.size;
base = new int(size);
base = rhs.base;
for (size_t i = 0; i != size; ++i)

return *this;
}

// auxiliary to find: implements the recursion
{

return parent;
}

// returns leader to input Node
int DisjointSet::find(const Node& input)
{
int parent( input.getLabel() );
if (input.getLabel() != *leaders(input.getLabel() - 1))

return parent;
}

// merges sets by assigning same leader (the lesser of the two Nodes)
void DisjointSet::unionNodes(const Node& a, const Node& b)
{
if (find(a) != find(b)) {
if (find(a) < find(b))
leaders(find(b) - 1) = &base(find(a) - 1);
else
leaders(find(a) - 1) = &base(find(b) - 1);
}
}
``````

KruskalClustering.h

``````#ifndef KRUSKALCLUSTERING_H_INCLUDED
#define KRUSKALCLUSTERING_H_INCLUDED

#include <vector>
#include "Graph.h"
#include "DisjointSet.h"

int clusteringKruskalNaive(const Graph&, const std::vector<Edge>&, int);
int clusteringKruskalDisjointSet(const Graph& graph0, const std::vector<Edge>& edges, int);

#endif // KRUSKALCLUSTERING_H_INCLUDED
``````

KruskalClustering.cpp

``````// Kruskal MST algorithm. Implementation specific to  (k=4)-clustering
// -naive (with breadth-first-search to check whether new edge creates a cycle); cost: O(#_edges * #_nodes)
// -and union-find implementations.  Cost: O(#_edges*log2(#_nodes))
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>        //std::find_if
#include "KruskalClustering.h"

using std::cout;            using std::endl;
using std::string;          using std::vector;
using std::find_if;

int clusteringKruskalNaive(const Graph& graph0, const vector<Edge>& edges, int k)
{
Graph T;    // Minimum Spanning Tree
vector<Edge>::const_iterator edges_iter = edges.begin();
int sum_costs(0), number_of_clusters( graph0.size() );  // keep track of overall cost of edges in T, and of clusters
while (number_of_clusters >= k) {

// find out if first node of edge is already in T
Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
(=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(0) - 1).getLabel();});
bool is1_in_T_flag;         // needed because T gets increased and thus invalidates iterator is1_in_T
Node* node_1 = new Node(*edges_iter); // no use of pointer here, it creates a new Node anyway, can't move Nodes to T
if ( is1_in_T == T.end() ) {      // node_1 not in T so we add it
number_of_clusters--;         // node_1 is not its own cluster any more
delete node_1;                // node_1 copied to T so ...
node_1 = &(T(T.size() - 1));  // ... it now points there
sum_costs += (*edges_iter).getCost();
is1_in_T_flag = false;
} else {                          // node_1 is in T already
delete node_1;                // if so, just update the pointer
node_1 = &(*is1_in_T);
is1_in_T_flag = true;
}

// perform BFS to check for cycles
bool check_cycles = breadthFirstSearch(T, edges_iter->getNodes()(0), edges_iter->getNodes()(1));

// create an identical edge, but with nodes positions swapped
Edge swapped_edge = *edges_iter;
swapped_edge.swapNodes();
// find out if second node of edge is already in T
Graph::iterator is2_in_T = find_if( T.begin(), T.end(),
(=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(1) - 1).getLabel();});
// (either node1 or 2 not in T, or both, or both present but in different clusters of T)
if (!check_cycles) {         // if edges_iter creates no cycle when added to T
if (is1_in_T_flag){          // if node_1 was already present in T
sum_costs += (*edges_iter).getCost();
} else
number_of_clusters++;   // if node_1 not present, it means number_of_cl was decreased above ...

number_of_clusters--;       // ... and number_of_cl can decrease just by one, if adding an edge to T
if ( is2_in_T != T.end() )  // node_2 already in T: just update its list of edges
else {                      // node_2 not in T, so add it
Node node_2(swapped_edge);
}

} else {                        // cycle created by *edges_iter
if (!is1_in_T_flag)         // in case the cycle happened just after adding node_1:
}
if (number_of_clusters >= k)    // advance to next Edge if num_clusters > k
edges_iter++;

// debug
//        T.output();
//        cout << "next edge: (" << (*edges_iter).getNodes()(0) << "-"
//             << (*edges_iter).getNodes()(1) << ") " << endl;
//        cout << "clustering: " << number_of_clusters << endl;
}
cout << "Sum of MST lengths is: " << sum_costs << endl;

return (*edges_iter).getCost();
}

// same algorithm, implemented with Union-find structure
int clusteringKruskalDisjointSet(const Graph& graph0, const vector<Edge>& edges, int k)
{
DisjointSet disjoint_set_graph0( graph0.size() );       // create Union-find structure

Graph T;
vector<Edge>::const_iterator edges_iter = edges.begin();
int sum_costs(0), number_of_clusters( graph0.size() );

while ( number_of_clusters >= k ) {
// if nodes in Edge have not the same leader in the disjoint set, then no loop is created, and T can add the edge
if ( disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()(0) - 1)) !=
disjoint_set_graph0.find(graph0.getNode(edges_iter->getNodes()(1) - 1)) ) {
sum_costs += (*edges_iter).getCost();
number_of_clusters--;               // no cycle created so the edge will be added to T

// look for node_1 in T
Graph::iterator is1_in_T = find_if(T.begin(), T.end(),
(=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(0) - 1).getLabel();});
if ( is1_in_T == T.end() ) {   // if node_1 not in T add it
Node node1(*edges_iter);
} else                         // if node_1 already in T only add to it this edge

Edge swapped_edge = *edges_iter;
swapped_edge.swapNodes();
// look for node_2 in T
Graph::iterator is2_in_T = find_if(T.begin(), T.end(),
(=) (Node& a) {return a.getLabel() == graph0.getNode(edges_iter->getNodes()(1) - 1).getLabel();});
if ( is2_in_T == T.end() ) {     // same as for node_1
Node node2(swapped_edge);
} else

// merge the 2 nodes' sets: update their disjointed set leaders
disjoint_set_graph0.unionNodes( graph0.getNode(edges_iter->getNodes()(0) - 1),
graph0.getNode(edges_iter->getNodes()(1) - 1) );
}

if (number_of_clusters >= 4)
edges_iter++;
//debug
//        T.output();
//        cout << "next edge: (" << (*edges_iter).getNodes()(0) << "-"
//             << (*edges_iter).getNodes()(1) << ") " << endl;
//        cout << "clustering: " << number_of_clusters << endl;
//        for (size_t i = 0; i != graph0.size(); ++i)
//            cout << disjoint_set_graph0.get(i) << " ";
//        cout << endl;
}
cout << "Sum of MST lengths is: " << sum_costs << endl;

return (*edges_iter).getCost();
}
``````

Lastly, main.cpp :

``````/* A max-spacing k-clustering program based on Kruskal MST algorithm.
The input file lists a complete graph with edge costs.
clusters k = 4 assumed.*/
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>        // std::sort;
#include "Graph.h"
#include "KruskalClustering.h"

using std::cout;                using std::endl;
using std::string;              using std::ifstream;
using std::vector;              using std::sort;

int main(int argc, char** argv)
{
cout << "Reading list of edges from input file ...n" << endl;

// read graph0 from input file
string filename(argv(1));

Graph graph0(filename);
//    graph0.output();   // debug
cout << endl;

// re-read input file and create a list of all edges in graph0
ifstream is(filename + ".txt");
int nodes_size;
is >> nodes_size;
vector<Edge> edges;
int node1, node2, cost;

while (is >> node1 >> node2 >> cost) {
Edge current_edge(node1, node2, cost);
edges.push_back(current_edge);
}
is.close();

// sort the edge list by increasing cost
cout << "Sorting edges ...n" << endl;
sort(edges.begin(), edges.end(), compareCosts);
//    for (vector<Edge>::iterator iter = edges.begin(); iter != edges.end(); ++iter)       // debug
//        cout << (*iter).getNodes()(0) << " " << (*iter).getNodes()(1) << "  " << (*iter).getCost() << endl;

cout << "Kruskal algorithm: Computing minimal distance between clusters when they are reduced to 4 ...n" << endl;

int k = 4;   // number of clusters desired

// pick implementation, comment the other
//int clustering_min_dist = clusteringKruskalNaive(graph0, edges, k);
int clustering_min_dist = clusteringKruskalDisjointSet(graph0, edges, k);

cout << "k = " << k << " clusters minimal distance is: " << clustering_min_dist << endl;

return 0;
}
``````