PROWAREtech
C++: Machine Learning, Unsupervised Learning or Clustering, K-mean / Silhouette Clustering Library
Unsupervised learning is what we humans do all the time, and it refers to algorithms that learn patterns from unlabeled data. Supervised learning is a machine learning paradigm for problems where the available data consists of labeled examples, meaning that each data point contains features (covariates) and an associated label. The goal of supervised learning algorithms is learning a function that maps feature vectors (inputs) to labels (output), based on example input-output pairs.
See this article for a supervised machine learning library written in C++.
K-mean Clustering & Silhouette
K-mean clustering is a method of vector quantization that aims to partition x number of observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.
The first center points, called centroids, are randomly chosen and then the data is re-evaluated to determine the new centroid positions and so on until the centroid positions do not change.
Silhouette refers to a method of interpretation and validation of consistency within clusters of data. The technique provides a succinct graphical representation of how well each object has been classified.
This code can use pure pseudorandom centroid selection or it can use k-means++ centroid selection. It also has the option to remove data outliers.
Visit the playground for related.
TODO: Integrate with Nvidia CUDA for a boost in processing power!
This code is also available in C#, but it does take about twice as long to run.
Download all this code and more: KMEANCLUSTERING.zip
Example Usage
// main.cpp
#include "kmean.h"
int main()
{
int K = 14; // the K-MAX value to set and the K that was used is returned in this variable
float KScore;
unsigned int count;
KMean::ClusteredPoint* points;
bool remove_outliers = true;
bool use_kmeanspp = true;
bool multithreaded = true;
// can also copy an array of the data points (with cluster information) to be used for analysis
KMean::FindBestKFromFile(remove_outliers, use_kmeanspp, multithreaded, "data.csv", 5, false, 0, 1, -1, -1, &K, &KScore, &count, (void**)&points);
for(unsigned int i = 0; i < count; i++)
{
int cluster_id = points[i].cid;
float x = points[i].x;
float y = points[i].y;
float z = points[i].z;
float w = points[i].w;
}
KMean::FreeMemory(points);
return 0;
}
Library Source Code
// kmean.h
#include <thread>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <time.h>
#include <limits>
#include <cmath>
#include <string>
#include <vector>
#include "csvparser.h"
#ifndef KMEAN_H
#define KMEAN_H
#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) < (Y)) ? (Y) : (X))
#define MIN_FLOAT (-std::numeric_limits<float>::max())
#define MAX_FLOAT (std::numeric_limits<float>::max())
namespace KMean
{
const int KMIN = 2;
struct Point
{
float x, y, z, w; // should be zero if not using the data
Point()
{
x = y = z = w = 0;
}
Point(Point* pt)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
Point(float x, float y, float z, float w)
{
this->x = x;
this->y = y;
this->z = z;
this->w = w;
}
};
struct ClusteredPoint : public Point
{
int cid;
ClusteredPoint() : cid(-1)
{
x = y = z = w = 0;
}
ClusteredPoint(Point* pt) : cid(-1)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
ClusteredPoint(float x, float y, float z, float w) : cid(-1)
{
this->x = x;
this->y = y;
this->z = z;
this->w = w;
}
};
struct DistancePoint : public Point
{
float distance;
DistancePoint(Point* pt, float distance) : distance(distance)
{
x = pt->x;
y = pt->y;
z = pt->z;
w = pt->w;
}
};
namespace Euclidean
{
// Euclidean distance formula between four points: distance = square_root((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1) + (w2 - w1) * (w2 - w1))
float FindDistanceBetweenPoints(const Point* const p1, const Point* const p2)
{
return (float)sqrt((p2->x - p1->x) * (p2->x - p1->x) + (p2->y - p1->y) * (p2->y - p1->y) + (p2->z - p1->z) * (p2->z - p1->z) + (p2->w - p1->w) * (p2->w - p1->w));
}
double FindDistanceBetweenPointsSquared(const Point* const p1, const Point* const p2)
{
return ((double)p2->x - (double)p1->x) * ((double)p2->x - (double)p1->x) + ((double)p2->y - (double)p1->y) * ((double)p2->y - (double)p1->y) + ((double)p2->z - (double)p1->z) * ((double)p2->z - (double)p1->z) + ((double)p2->w - (double)p1->w) * ((double)p2->w - (double)p1->w);
}
}
namespace Sorting
{
void qsort_distance_points(DistancePoint** const arr, const std::size_t size)
{
if (size <= 30)
{
// insertion sort algorithm
for (std::size_t i = 1; i < size; i++) {
DistancePoint* temp = arr[i];
std::size_t j = i;
while (j > 0 && temp->distance < arr[j - 1]->distance) {
arr[j] = arr[j - 1];
j--;
}
arr[j] = temp;
}
}
else
{
// quick sort algorithm
double pivot = arr[size / 2]->distance;
DistancePoint** left = arr;
DistancePoint** right = arr + size - 1;
while (true) {
while (left <= right && (*left)->distance < pivot) left++;
while (left <= right && (*right)->distance > pivot) right--;
if (left > right)
break;
DistancePoint* temp = *left;
*left++ = *right;
*right-- = temp;
}
qsort_distance_points(arr, right - arr + 1);
qsort_distance_points(left, arr + size - left);
}
}
}
class Silhouette
{
private:
static int FindNearestNeighboringClusterId(std::vector<Point*>& centroids, Point* centroid)
{
int id = -1;
float min = std::numeric_limits<float>::max();
std::size_t count = centroids.size();
for (int i = 0; i < count; i++)
{
float dist = Euclidean::FindDistanceBetweenPoints(centroids[i], centroid);
if (dist > 0 && dist <= min)
{
id = i;
min = dist;
}
}
return id;
}
public:
// Silhouette Formula: S(i) = [b(i) - a(i)] / MAX(a(i), b(i))
// a(i) = the average of the distance between point i and all other data points in its cluster
// b(i) = the average of the distance between point i and all other data points outside its cluster
static float CalculateKScore(std::vector<ClusteredPoint*>& points, std::vector<Point*>& centroids)
{
const std::size_t cluster_count = centroids.size();
double a_total = 0;
double b_total = 0;
int a_count = 0;
int b_count = 0;
const std::size_t count = points.size();
for (std::size_t cid = 0; cid < cluster_count; cid++)
{
for (int j = 0; j < count; j++)
{
if (points[j]->cid == cid)
{
a_count++;
a_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
}
else if (points[j]->cid == FindNearestNeighboringClusterId(centroids, centroids[cid]))
{
b_count++;
b_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
}
}
}
float a, b;
if (a_count == 0)
a = 0;
else
a = a_total / a_count;
if (b_count == 0)
b = 0;
else
b = b_total / b_count;
float max = MAX(a, b);
if (max == 0)
return MIN_FLOAT;
return (b - a) / max;
}
};
class RawData
{
private:
const int columnX, columnY, columnZ, columnW;
public:
std::string labelX, labelY, labelZ, labelW;
std::vector<Point*> points;
RawData(const int colX, const int colY, const int colZ, const int colW) : columnX(colX), columnY(colY), columnZ(colZ), columnW(colW) {}
bool Initialize(const int remove_outliers, CSVParser::CSVNumericData& parsedCSVData)
{
std::vector<std::string> columnNames = parsedCSVData.column_names();
if (columnX < columnNames.size())
labelX = columnNames[columnX];
if (columnY >= 0 && columnY < columnNames.size())
labelY = columnNames[columnY];
if (columnZ >= 0 && columnZ < columnNames.size())
labelZ = columnNames[columnZ];
if (columnW >= 0 && columnW < columnNames.size())
labelW = columnNames[columnW];
std::size_t col_count;
for (int row = 0; row < parsedCSVData.row_count(); row++)
{
col_count = parsedCSVData[row].size();
if (columnX < col_count && (columnY < 0 || columnY < col_count) && (columnZ < 0 || columnZ < col_count) && (columnW < 0 || columnW < col_count))
{
Point* pt = new Point(
parsedCSVData[row][columnX],
columnY < 0 ? 0 : parsedCSVData[row][columnY],
columnZ < 0 ? 0 : parsedCSVData[row][columnZ],
columnW < 0 ? 0 : parsedCSVData[row][columnW]);
points.push_back(pt);
}
}
std::size_t count = points.size();
if (count && remove_outliers) // use interquartile range to find outliers
{
DistancePoint** dp_array = new DistancePoint * [count];
// find average distance of every point from each other point
for (std::size_t i = 0; i < count; i++)
{
double total_distance = 0;
for (std::size_t j = 0; j < count; j++)
if (i != j)
total_distance += Euclidean::FindDistanceBetweenPoints(points[j], points[i]);
dp_array[i] = new DistancePoint(points[i], total_distance / count);
}
Sorting::qsort_distance_points(dp_array, count);
DistancePoint* Q1 = dp_array[count / 4];
DistancePoint* Q3 = dp_array[count / 2 + count / 4 + (count & 1)];
double IQR = Q3->distance - Q1->distance;
double upper_fence = Q3->distance + 1.5 * IQR;
double lower_fence = Q1->distance - 1.5 * IQR;
std::size_t len = count;
for (std::size_t i = 0; i < len; )
{
if (dp_array[i]->distance < lower_fence || dp_array[i]->distance > upper_fence)
{
delete dp_array[i];
dp_array[i] = nullptr;
len--;
for (std::size_t j = i; j < len; j++)
dp_array[j] = dp_array[j + 1];
}
else
i++;
}
for (Point* pt : points)
delete pt;
points.clear();
for (std::size_t i = 0; i < len; i++)
{
points.push_back(new Point(dp_array[i]));
delete dp_array[i];
}
delete[] dp_array;
count = len;
}
return count > 0;
}
~RawData()
{
for (Point* p : points)
{
if (p)
delete p;
}
}
};
class Data
{
private:
float score;
double GetNearestCentroidDistance(Point* pt, std::vector<Point*>& centroids)
{
double d, min_dist = std::numeric_limits<float>::max();
std::size_t centroid_count = centroids.size();
for (std::size_t i = 0; i < centroid_count; i++)
{
d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
if (d < min_dist)
min_dist = d;
}
return min_dist;
}
void kMeanspp()
{
std::size_t point_count = points.size();
centroids.push_back(new Point(points[rand() % point_count]));
double* distances = new double[point_count];
double sum = 0;
for (std::size_t index = 1; index < K; index++)
{
for (std::size_t i = 0; i < point_count; i++)
sum += (distances[i] = GetNearestCentroidDistance(points[i], centroids));
sum *= rand() / (float)RAND_MAX;
for (std::size_t i = 0; i < point_count; i++)
{
sum -= distances[i];
if (sum < 0)
{
centroids.push_back(new Point(points[i]));
break;
}
}
}
delete[] distances;
}
static bool IsChosen(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count, const std::size_t index)
{
for (int i = 0; i < K; i++)
{
if (chosen_indexes_to_centroid[i] == count)
break;
if (chosen_indexes_to_centroid[i] == index)
return true;
}
return false;
}
static std::size_t FindUnchosenRandomIndex(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count)
{
std::size_t index = rand() % count;
while (true)
{
while (IsChosen(chosen_indexes_to_centroid, K, count, index))
index++;
if (index < count)
break;
index = rand() % count;
}
return index;
}
public:
const int K;
std::vector<Point*> centroids;
std::vector<ClusteredPoint*> points;
float KScore()
{
if (score == MIN_FLOAT)
score = Silhouette::CalculateKScore(points, centroids);
return score;
}
Data(const int K) : score(MIN_FLOAT), K(K) {}
bool Initialize(const bool use_kmeanspp, RawData* raw_data)
{
bool ret = false;
if (raw_data && raw_data->points.size())
{
std::size_t count = raw_data->points.size();
for (std::size_t i = 0; i < count; i++)
points.push_back(new ClusteredPoint(raw_data->points[i]));
if (use_kmeanspp) // use k-means++ method to find initial centroids
{
kMeanspp();
}
else // use total random method to choose centroids
{
std::size_t* const chosen_indexes_to_centroid = new std::size_t[K];
for (int i = 0; i < K; i++)
chosen_indexes_to_centroid[i] = count;
for (int i = 0; i < K; i++)
{
chosen_indexes_to_centroid[i] = FindUnchosenRandomIndex(chosen_indexes_to_centroid, K, count);
centroids.push_back(new Point(points[chosen_indexes_to_centroid[i]]));
}
delete[] chosen_indexes_to_centroid;
}
ret = true;
}
return ret;
}
~Data()
{
for (Point* c : centroids)
{
if (c)
delete c;
}
for (ClusteredPoint* p : points)
{
if (p)
delete p;
}
}
};
class Clustering
{
public:
static std::vector<Data*> Model(const bool use_kmeanspp, const int K, RawData* raw_data)
{
std::vector<Data*> dataArray;
for (int i = 0; i < K; i++)
{
Data* data = new Data(K);
if (!data->Initialize(use_kmeanspp, raw_data))
delete data;
else
{
std::vector<Point*> temp_points;
while (true)
{
for (ClusteredPoint* point : data->points)
point->cid = GetNearestCentroidIndex(point, data->centroids);
CopyPointArrays(data->centroids, temp_points);
RecenterCentroids(data);
if (PointArraysAreEqual(data->centroids, temp_points))
break;
}
dataArray.push_back(data);
}
}
return dataArray;
}
static void FindBestModelAndDelete(std::vector<std::vector<KMean::Data*>>& models, int* pK, float* pKScore, unsigned int* data_point_count, void** data_points)
{
Data *best = nullptr;
float kscore = MIN_FLOAT;
for (std::size_t i = 0; i < models.size(); i++)
{
for (std::size_t j = 0; j < models[i].size(); j++)
if (models[i][j]->KScore() > kscore)
{
kscore = models[i][j]->KScore();
best = models[i][j];
}
}
if (best)
{
if (pKScore)
*pKScore = best->KScore();
if (pK)
*pK = best->K;
if (data_point_count)
{
std::size_t count = best->points.size();
(*data_point_count) = (unsigned int)count;
if (data_points)
{
ClusteredPoint* points = new ClusteredPoint[count];
for (std::size_t i = 0; i < count; i++)
{
points[i].cid = best->points[i]->cid;
points[i].x = best->points[i]->x;
points[i].y = best->points[i]->y;
points[i].z = best->points[i]->z;
points[i].w = best->points[i]->w;
}
*data_points = points;
}
}
}
else
{
if (pKScore)
*pKScore = MIN_FLOAT;
if (pK)
*pK = 0;
if (data_point_count)
{
(*data_point_count) = 0;
if (data_points)
*data_points = nullptr;
}
}
for (std::size_t i = 0; i < models.size(); i++)
{
for (std::size_t j = 0; j < models[i].size(); j++)
delete models[i][j];
}
}
private:
static std::size_t GetNearestCentroidIndex(Point* pt, std::vector<Point*>& centroids)
{
double d, min_dist = std::numeric_limits<float>::max();
std::size_t centroid_count = centroids.size();
std::size_t min_index = centroid_count;
for (std::size_t i = 0; i < centroid_count; i++)
{
d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
if (d < min_dist)
{
min_dist = d;
min_index = i;
}
}
return min_index;
}
static void CopyPointArrays(std::vector<Point*> &src, std::vector<Point*> &dst)
{
if (dst.size() < src.size())
{
for (std::size_t i = 0; i < src.size(); i++)
{
Point* pt = new Point(src[i]);
pt->x = src[i]->x;
pt->y = src[i]->y;
pt->z = src[i]->z;
pt->w = src[i]->w;
dst.push_back(pt);
}
}
else
{
for (std::size_t i = 0; i < src.size(); i++)
{
dst[i]->x = src[i]->x;
dst[i]->y = src[i]->y;
dst[i]->z = src[i]->z;
dst[i]->w = src[i]->w;
}
}
}
static bool PointArraysAreEqual(std::vector<Point*> &points1, std::vector<Point*> &points2)
{
std::size_t count = points1.size();
for (std::size_t i = 0; i < points1.size() && count > 0; i++)
{
for (std::size_t j = 0; j < points2.size() && count > 0; j++)
{
if (points2[j]->x == points1[i]->x && points2[j]->y == points1[i]->y && points2[j]->z == points1[i]->z && points2[j]->w == points1[i]->w)
count--;
}
}
return count == 0;
}
static void RecenterCentroids(Data* data)
{
const int K = data->K;
for (int i = 0; i < K; i++)
{
int c = 0;
double x_total = 0;
double y_total = 0;
double z_total = 0;
double w_total = 0;
for (ClusteredPoint* point : data->points)
{
if (point->cid == i)
{
c++;
x_total += point->x;
y_total += point->y;
z_total += point->z;
w_total += point->w;
}
}
if (c == 0)
data->centroids[i]->x = data->centroids[i]->y = data->centroids[i]->z = data->centroids[i]->w = 0;
else
{
data->centroids[i]->x = x_total / c;
data->centroids[i]->y = y_total / c;
data->centroids[i]->z = z_total / c;
data->centroids[i]->w = w_total / c;
}
}
}
};
struct thread_params
{
int active;
const bool use_kmeanspp;
const int K;
std::vector<Data*> data;
RawData* raw_data;
std::thread* pthread;
thread_params(const bool use_kmeanspp, const int K, RawData* raw_data) : active(true), use_kmeanspp(use_kmeanspp), K(K), raw_data(raw_data), pthread(nullptr) {}
};
static void thread(thread_params* params)
{
params->data = Clustering::Model(params->use_kmeanspp, params->K, params->raw_data);
params->active = false;
}
static void FindBestK(CSVParser::CSVNumericData &csv_data, const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY, const int columnZ, const int columnW, int* const pK, float* const pKScore, unsigned int* const data_point_count, void** const data_points)
{
if (csv_data.row_count() > 0)
{
const int KMAX = pK ? (*pK > 24 ? 24 : *pK) : 14;
std::vector<std::vector<KMean::Data*>> modelData;
RawData* raw_data = new RawData(columnX, columnY, columnZ, columnW);
if (raw_data->Initialize(remove_outliers, csv_data))
{
if (multithreaded)
{
std::vector<thread_params*> threads;
for (int K = KMean::KMIN; K <= KMAX; K++)
{
for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
{
thread_params* params = new thread_params(use_kmeanspp, K, raw_data);
params->pthread = new std::thread(thread, params);
threads.push_back(params);
}
}
while (true)
{
int active_count = 0;
for (std::size_t i = 0; i < threads.size(); i++)
active_count += threads[i]->active;
if (!active_count)
break;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
for (std::size_t i = 0; i < threads.size(); i++)
if (threads[i]->pthread->joinable())
threads[i]->pthread->join();
std::this_thread::sleep_for(std::chrono::seconds(1));
for (std::size_t i = 0; i < threads.size(); i++)
{
modelData.push_back(threads[i]->data);
delete threads[i]->pthread;
delete threads[i];
}
}
else
{
for (int K = KMean::KMIN; K <= KMAX; K++)
for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
modelData.push_back(Clustering::Model(use_kmeanspp, K, raw_data));
}
}
delete raw_data;
Clustering::FindBestModelAndDelete(modelData, pK, pKScore, data_point_count, data_points);
}
}
void FindBestKFromFile(const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const char* filename, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
{
srand(time(0)); // initialize rand()
std::fstream file;
file.open(filename, std::ios::in);
if (file.is_open())
{
CSVParser::CSVNumericData csv_data(file, first_row_column_names);
FindBestK(csv_data, remove_outliers, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
}
void FindBestKFromCSV(const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const char *csv_string, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
{
srand(time(0)); // initialize rand()
std::istringstream ss(csv_string);
CSVParser::CSVNumericData csv_data(ss, first_row_column_names);
FindBestK(csv_data, remove_outliers, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
void FreeMemory(void* memory_ptr)
{
delete[] (ClusteredPoint*)memory_ptr;
}
}
#endif
// hashtable.h
#include <iostream>
#include <iomanip>
#include <string>
#ifndef HASHTABLE_H
#define HASHTABLE_H
template <class TKey, class TValue> class Hashtable;
#define TABLE_SIZE 1000003 // use a large prime number
unsigned int hash_func(const std::string& key)
{
unsigned int h = 0;
unsigned int o = 31415;
const unsigned int t = 27183;
const char* k = key.c_str();
while (*k)
{
h = (o * h + *k++) % TABLE_SIZE;
o = o * t % (TABLE_SIZE - 1);
}
return h;
}
unsigned int hash_func(const unsigned int key)
{
// make sure table size is actually (TABLE_SIZE+1) (more about this below)
return(key & TABLE_SIZE);
}
template <class TKey, class TValue> class HashtableItem;
template <class TKey, class TValue> class HashtableDataMembers;
template <class TKey, class TValue> class HashtableDataMembers
{
public:
HashtableItem<TKey, TValue>** table;
HashtableItem<TKey, TValue>* cur_table_item;
unsigned int table_size, count;
HashtableDataMembers();
~HashtableDataMembers();
};
template <class TKey, class TValue> class Hashtable
{
private:
HashtableDataMembers<TKey, TValue>* dm;
public:
Hashtable();
Hashtable(const Hashtable& obj);
~Hashtable();
bool Add(const TKey& key, const TValue& value); // Add a new entry, returns false when the key already exists
const Hashtable<TKey, TValue>& operator=(const Hashtable<TKey, TValue>& obj);
HashtableItem<TKey, TValue>* operator[](const TKey& key) const;
void Remove(const TKey& key); // removes one table entry
void Clear(); // removes all the table entries
unsigned int Count(); // returns count of entries;
};
template <class TKey, class TValue> class HashtableItem
{
private:
HashtableItem<TKey, TValue>* pnext;
TKey key;
TValue value;
// keep these private to prevent the client from creating this object
HashtableItem() {}
HashtableItem(const TKey& key, const TValue& value);
~HashtableItem();
public:
const TKey& Key() const;
const TValue& Value() const;
const TValue& operator=(const TValue& value);
// some friend functions that can access the private data
friend bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value);
friend HashtableDataMembers<TKey, TValue>::HashtableDataMembers();
friend HashtableDataMembers<TKey, TValue>::~HashtableDataMembers();
friend void Hashtable<TKey, TValue>::Remove(const TKey& key);
friend const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj);
friend HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const;
friend void Hashtable<TKey, TValue>::Clear();
};
// ##################### class HashtableDataMembers ###################
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::HashtableDataMembers()
{
cur_table_item = nullptr;
count = 0;
table_size = TABLE_SIZE + 1; // add one to the TABLE_SIZE because the int hash_func can actually return TABLE_SIZE
table = new HashtableItem<TKey, TValue>* [table_size]();
for (unsigned int i = 0; i < table_size; table[i++] = nullptr);
}
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::~HashtableDataMembers()
{
for (unsigned int i = 0; i < table_size; i++)
{
if (table[i])
{
delete table[i];
table[i] = nullptr; // clean-up the memory
}
}
delete[] table;
}
// ##################### class Hashtable ###########################
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable()
{
dm = new HashtableDataMembers<TKey, TValue>();
}
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable(const Hashtable& obj)
{
dm = new HashtableDataMembers<TKey, TValue>();
this->operator=(obj);
}
template <class TKey, class TValue> Hashtable<TKey, TValue>::~Hashtable()
{
delete dm;
dm = nullptr;
}
template <class TKey, class TValue> const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj)
{
if (this != &obj)
{
unsigned int i;
Clear();
if (dm->table_size != obj.dm->table_size)
{
if (dm->table)
delete[]dm->table;
dm->table_size = obj.dm->table_size;
dm->table = new HashtableItem<TKey, TValue>* [dm->table_size]();
for (i = 0; i < dm->table_size; dm->table[i++] = nullptr);
}
for (i = 0; i < obj.dm->table_size; i++)
{
HashtableItem<TKey, TValue>** ppobjnode, ** ppnode;
ppobjnode = &obj.dm->table[i];
ppnode = &dm->table[i];
while (*ppobjnode)
{
(*ppnode) = new HashtableItem<TKey, TValue>((*ppobjnode)->key, (*ppobjnode)->value);
if (obj.dm->cur_table_item == *ppobjnode)
dm->cur_table_item = *ppnode;
ppnode = &(*ppnode)->pnext;
ppobjnode = &(*ppobjnode)->pnext;
}
}
}
return (*this);
}
template <class TKey, class TValue> bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value)
{
unsigned int i = hash_func(key);
HashtableItem<TKey, TValue>** ppnode;
for (ppnode = &dm->table[i]; (*ppnode) && ((*ppnode)->key != key); ppnode = &(*ppnode)->pnext);
if (*ppnode) // then found
return false;
(*ppnode) = new HashtableItem<TKey, TValue>(key, value);
dm->count++;
return true;
}
template <class TKey, class TValue> HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const
{
unsigned int i = hash_func(key);
HashtableItem<TKey, TValue>* pnode;
for (pnode = dm->table[i]; pnode && (pnode->key != key); pnode = pnode->pnext);
return pnode;
}
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Remove(const TKey& key)
{
unsigned int i = hash_func(key);
HashtableItem<TKey, TValue>** ppnode, * tmp;
for (ppnode = &dm->table[i]; (*ppnode) && ((*ppnode)->key != key); ppnode = &(*ppnode)->pnext);
if (*ppnode)
{
tmp = (*ppnode);
(*ppnode) = (*ppnode)->pnext;
tmp->pnext = nullptr;
delete tmp;
}
dm->count--;
}
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Clear()
{
for (unsigned int i = 0; i < dm->table_size; i++)
{
if (dm->table[i])
{
delete dm->table[i];
dm->table[i] = nullptr;
}
}
dm->count = 0;
}
template <class TKey, class TValue> unsigned int Hashtable<TKey, TValue>::Count()
{
return dm->count;
}
// ##################### class HashtableItem ###########################
template <class TKey, class TValue> HashtableItem<TKey, TValue>::HashtableItem(const TKey& xKey, const TValue& xValue)
{
key = xKey;
value = xValue;
pnext = nullptr;
}
template <class TKey, class TValue> HashtableItem<TKey, TValue>::~HashtableItem()
{
if (this->pnext)
{
delete this->pnext;
this->pnext = nullptr; // clean-up memory
}
}
template <class TKey, class TValue> const TKey& HashtableItem<TKey, TValue>::Key() const
{
return this->key;
}
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::Value() const
{
return this->value;
}
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::operator=(const TValue& value)
{
this->value = value;
return value;
}
#endif
Windows Specific Code
// dllmain.cpp
#include "kmean.h"
using namespace KMean;
#include <windows.h>
BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
switch (ul_reason_for_call)
{
case DLL_PROCESS_ATTACH:
case DLL_THREAD_ATTACH:
case DLL_THREAD_DETACH:
case DLL_PROCESS_DETACH:
break;
}
return TRUE;
}
extern "C"
{
void WinFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, const char* filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
{
FindBestKFromFile(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, filename, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
void WinFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, const char* csv_string, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
{
FindBestKFromCSV(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, csv_string, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
void* WinGetClusteredPointElement(unsigned int index, void* data_points)
{
return &((ClusteredPoint*)data_points)[index];
}
void WinFreeMemory(void* memory_ptr)
{
FreeMemory(memory_ptr);
}
}
LIBRARY KMeanSilhouetteDll
EXPORTS
WinFindBestKFromFile @1
WinFindBestKFromCSV @2
WinGetClusteredPointElement @3
WinFreeMemory @4
Linux Specific Code
// LINUX SPECIFIC SOURCE FILE: FOR SHARED LIBRARY FUNCTION EXPORTS
#include "kmean.h"
using namespace KMean;
extern "C"
{
__attribute__((visibility("default"))) void LinuxFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, const char* filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
{
FindBestKFromFile(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, filename, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
__attribute__((visibility("default"))) void LinuxFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, const char* csv_string, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
{
FindBestKFromCSV(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, csv_string, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
}
__attribute__((visibility("default"))) void* LinuxGetClusteredPointElement(unsigned int index, void* data_points)
{
return &((ClusteredPoint*)data_points)[index];
}
__attribute__((visibility("default"))) void LinuxFreeMemory(void* memory_ptr)
{
FreeMemory(memory_ptr);
}
}
C# Code
THIS C# CODE FILE USES A DLL ON WINDOWS OR SHARED LIBRARY ON LINUX COMPILED FROM THE CODE ABOVE.
// CppKMean.cs
using System.Collections.Generic;
using System.Runtime.InteropServices;
namespace Cpp.KMean
{
[StructLayout(LayoutKind.Sequential)]
public class ClusteredPoint
{
public float x { get; set; }
public float y { get; set; }
public float z { get; set; }
public float w { get; set; }
public int cid { get; set; }
}
public class Clustering
{
[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
private static extern void WinFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, string filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);
[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
private static extern void WinFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, string csv_data, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);
[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
private static extern IntPtr WinGetClusteredPointElement(uint index, IntPtr data_points);
[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
private static extern void WinFreeMemory(IntPtr memory_ptr);
[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
private static extern void LinuxFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, string filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);
[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
private static extern void LinuxFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, string csv_data, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);
[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
private static extern IntPtr LinuxGetClusteredPointElement(uint index, IntPtr data_points);
[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
private static extern void LinuxFreeMemory(IntPtr memory_ptr);
static public IEnumerable<ClusteredPoint> FindBestKFromFile(bool remove_outliers, bool use_kmeanspp, bool multithreaded, string filename, int attempts_to_improve_kscore, bool first_row_column_names, out int pK, out float pKScore, int columnX, int? columnY, int? columnZ, int? columnW)
{
if (!File.Exists(filename) || new FileInfo(filename).Length == 0)
{
pK = 0;
pKScore = float.MinValue;
return new List<ClusteredPoint>();
}
uint count;
IntPtr array;
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
LinuxFindBestKFromFile(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, filename, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
var point = new ClusteredPoint();
var points = new List<ClusteredPoint>();
for (uint i = 0; i < count; i++)
{
Marshal.PtrToStructure(LinuxGetClusteredPointElement(i, array), point);
points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
}
LinuxFreeMemory(array);
return points;
}
else if(RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
WinFindBestKFromFile(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, filename, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
var point = new ClusteredPoint();
var points = new List<ClusteredPoint>();
for (uint i = 0; i < count; i++)
{
Marshal.PtrToStructure(WinGetClusteredPointElement(i, array), point);
points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
}
WinFreeMemory(array);
return points;
}
else
{
pKScore = pK = 0;
return new List<ClusteredPoint>();
}
}
static public IEnumerable<ClusteredPoint> FindBestKFromCSV(bool remove_outliers, bool use_kmeanspp, bool multithreaded, string csv_data, int attempts_to_improve_kscore, bool first_row_column_names, out int pK, out float pKScore, int columnX, int? columnY, int? columnZ, int? columnW)
{
if (csv_data.Length == 0)
{
pK = 0;
pKScore = float.MinValue;
return new List<ClusteredPoint>();
}
uint count;
IntPtr array;
if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
{
LinuxFindBestKFromCSV(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, csv_data, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
var point = new ClusteredPoint();
var points = new List<ClusteredPoint>();
for (uint i = 0; i < count; i++)
{
Marshal.PtrToStructure(LinuxGetClusteredPointElement(i, array), point);
points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
}
LinuxFreeMemory(array);
return points;
}
else if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
WinFindBestKFromCSV(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, csv_data, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
var point = new ClusteredPoint();
var points = new List<ClusteredPoint>();
for (uint i = 0; i < count; i++)
{
Marshal.PtrToStructure(WinGetClusteredPointElement(i, array), point);
points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
}
WinFreeMemory(array);
return points;
}
else
{
pKScore = pK = 0;
return new List<ClusteredPoint>();
}
}
}
}