PROWAREtech

articles » current » c-plus-plus » machine-learning-unsupervised-clustering-with-k-mean-silhouette

C++: Machine Learning, Unsupervised Learning or Clustering, K-mean / Silhouette Clustering Library

A machine learning library that implements with k-means++ written in pure C++ for Windows and Linux.

Unsupervised learning is what we humans do all the time, and it refers to algorithms that learn patterns from unlabeled data. Supervised learning is a machine learning paradigm for problems where the available data consists of labeled examples, meaning that each data point contains features (covariates) and an associated label. The goal of supervised learning algorithms is learning a function that maps feature vectors (inputs) to labels (output), based on example input-output pairs.

See this article for a supervised machine learning library written in C++.

K-mean Clustering & Silhouette

K-mean clustering is a method of vector quantization that aims to partition x number of observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster.

The first center points, called centroids, are randomly chosen and then the data is re-evaluated to determine the new centroid positions and so on until the centroid positions do not change.

Silhouette refers to a method of interpretation and validation of consistency within clusters of data. The technique provides a succinct graphical representation of how well each object has been classified.

This code can use pure pseudorandom centroid selection or it can use k-means++ centroid selection. It also has the option to remove data outliers.

Visit the playground for related.

TODO: Integrate with Nvidia CUDA for a boost in processing power!

This code is also available in C#, but it does take about twice as long to run.

Download all this code and more: KMEANCLUSTERING.zip

Example Usage


// main.cpp

#include "kmean.h"

int main()
{
	int K;
	float KScore;
	unsigned int count;
	KMean::ClusteredPoint* points;
	bool remove_outliers = true;
	bool use_kmeanspp = true;
	bool multithreaded = true;

	// can also copy an array of the data points (with cluster information) to be used for analysis
	KMean::FindBestKFromFile(remove_outliers, use_kmeanspp, multithreaded, "data.csv", 5, false, 0, 1, -1, -1, &K, &KScore, &count, (void**)&points);

	for(unsigned int i = 0; i < count; i++)
	{
		int cluster_id = points[i].cid;
		float x = points[i].x;
		float y = points[i].y;
		float z = points[i].z;
		float w = points[i].w;
	}

	KMean::FreeMemory(points);

	return 0;
}

Library Source Code


// kmean.h

#include <thread>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <time.h>
#include <limits>
#include <cmath>
#include <string>
#include <vector>
#include "csvparser.h"

#ifndef KMEAN_H

#define KMEAN_H

#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#define MAX(X, Y) (((X) < (Y)) ? (Y) : (X))

#define MIN_FLOAT (-std::numeric_limits<float>::max())
#define MAX_FLOAT (std::numeric_limits<float>::max())

namespace KMean
{
	const int KMIN = 2, KMAX = 14;

	struct Point
	{
		float x, y, z, w; // should be zero if not using the data
		Point()
		{
			x = y = z = w = 0;
		}
		Point(Point* pt)
		{
			x = pt->x;
			y = pt->y;
			z = pt->z;
			w = pt->w;
		}
		Point(float x, float y, float z, float w)
		{
			this->x = x;
			this->y = y;
			this->z = z;
			this->w = w;
		}
	};

	struct ClusteredPoint : public Point
	{
		int cid;
		ClusteredPoint() : cid(-1)
		{
			x = y = z = w = 0;
		}
		ClusteredPoint(Point* pt) : cid(-1)
		{
			x = pt->x;
			y = pt->y;
			z = pt->z;
			w = pt->w;
		}
		ClusteredPoint(float x, float y, float z, float w) : cid(-1)
		{
			this->x = x;
			this->y = y;
			this->z = z;
			this->w = w;
		}
	};

	struct DistancePoint : public Point
	{
		float distance;
		DistancePoint(Point* pt, float distance) : distance(distance)
		{
			x = pt->x;
			y = pt->y;
			z = pt->z;
			w = pt->w;
		}
	};

	namespace Euclidean
	{
		// Euclidean distance formula between four points: distance = square_root((x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1) + (w2 - w1) * (w2 - w1))
		float FindDistanceBetweenPoints(const Point* const p1, const Point* const p2)
		{
			return (float)sqrt((p2->x - p1->x) * (p2->x - p1->x) + (p2->y - p1->y) * (p2->y - p1->y) + (p2->z - p1->z) * (p2->z - p1->z) + (p2->w - p1->w) * (p2->w - p1->w));
		}
		double FindDistanceBetweenPointsSquared(const Point* const p1, const Point* const p2)
		{
			return ((double)p2->x - (double)p1->x) * ((double)p2->x - (double)p1->x) + ((double)p2->y - (double)p1->y) * ((double)p2->y - (double)p1->y) + ((double)p2->z - (double)p1->z) * ((double)p2->z - (double)p1->z) + ((double)p2->w - (double)p1->w) * ((double)p2->w - (double)p1->w);
		}
	}

	namespace Sorting
	{
		void qsort_distance_points(DistancePoint** const arr, const std::size_t size)
		{
			if (size <= 30)
			{
				// insertion sort algorithm
				for (std::size_t i = 1; i < size; i++) {
					DistancePoint* temp = arr[i];
					std::size_t j = i;
					while (j > 0 && temp->distance < arr[j - 1]->distance) {
						arr[j] = arr[j - 1];
						j--;
					}
					arr[j] = temp;
				}
			}
			else
			{
				// quick sort algorithm
				double pivot = arr[size / 2]->distance;
				DistancePoint** left = arr;
				DistancePoint** right = arr + size - 1;
				while (true) {
					while (left <= right && (*left)->distance < pivot) left++;
					while (left <= right && (*right)->distance > pivot) right--;
					if (left > right)
						break;
					DistancePoint* temp = *left;
					*left++ = *right;
					*right-- = temp;
				}
				qsort_distance_points(arr, right - arr + 1);
				qsort_distance_points(left, arr + size - left);
			}
		}
	}

	class Silhouette
	{
	private:

		static int FindNearestNeighboringClusterId(std::vector<Point*>& centroids, Point* centroid)
		{
			int id = -1;
			float min = std::numeric_limits<float>::max();
			std::size_t count = centroids.size();
			for (int i = 0; i < count; i++)
			{
				float dist = Euclidean::FindDistanceBetweenPoints(centroids[i], centroid);
				if (dist > 0 && dist <= min)
				{
					id = i;
					min = dist;
				}
			}
			return id;
		}

	public:
		// Silhouette Formula: S(i) = [b(i) - a(i)] / MAX(a(i), b(i))
		// a(i) = the average of the distance between point i and all other data points in its cluster
		// b(i) = the average of the distance between point i and all other data points outside its cluster
		static float CalculateKScore(std::vector<ClusteredPoint*>& points, std::vector<Point*>& centroids)
		{
			const std::size_t cluster_count = centroids.size();
			double a_total = 0;
			double b_total = 0;
			int a_count = 0;
			int b_count = 0;
			const std::size_t count = points.size();
			for (std::size_t cid = 0; cid < cluster_count; cid++)
			{
				for (int j = 0; j < count; j++)
				{
					if (points[j]->cid == cid)
					{
						a_count++;
						a_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
					}
					else if (points[j]->cid == FindNearestNeighboringClusterId(centroids, centroids[cid]))
					{
						b_count++;
						b_total += Euclidean::FindDistanceBetweenPoints(points[j], centroids[cid]);
					}
				}
			}
			float a, b;
			if (a_count == 0)
				a = 0;
			else
				a = a_total / a_count;
			if (b_count == 0)
				b = 0;
			else
				b = b_total / b_count;
			float max = MAX(a, b);
			if (max == 0)
				return MIN_FLOAT;
			return (b - a) / max;
		}

	};

	class RawData
	{
	private:
		const int columnX, columnY, columnZ, columnW;

	public:
		std::string labelX, labelY, labelZ, labelW;

		std::vector<Point*> points;

		RawData(const int colX, const int colY, const int colZ, const int colW) : columnX(colX), columnY(colY), columnZ(colZ), columnW(colW) {}

		bool Initialize(const int remove_outliers, CSVParser::CSVNumericData& parsedCSVData)
		{
			std::vector<std::string> columnNames = parsedCSVData.column_names();
			if (columnX < columnNames.size())
				labelX = columnNames[columnX];
			if (columnY >= 0 && columnY < columnNames.size())
				labelY = columnNames[columnY];
			if (columnZ >= 0 && columnZ < columnNames.size())
				labelZ = columnNames[columnZ];
			if (columnW >= 0 && columnW < columnNames.size())
				labelW = columnNames[columnW];
			std::size_t col_count;
			for (int row = 0; row < parsedCSVData.row_count(); row++)
			{
				col_count = parsedCSVData[row].size();
				if (columnX < col_count && (columnY < 0 || columnY < col_count) && (columnZ < 0 || columnZ < col_count) && (columnW < 0 || columnW < col_count))
				{
					Point* pt = new Point(
						parsedCSVData[row][columnX],
						columnY < 0 ? 0 : parsedCSVData[row][columnY],
						columnZ < 0 ? 0 : parsedCSVData[row][columnZ],
						columnW < 0 ? 0 : parsedCSVData[row][columnW]);
					points.push_back(pt);
				}
			}

			std::size_t count = points.size();
			if (count && remove_outliers) // use interquartile range to find outliers
			{
				DistancePoint** dp_array = new DistancePoint * [count];

				// find average distance of every point from each other point
				for (std::size_t i = 0; i < count; i++)
				{
					double total_distance = 0;
					for (std::size_t j = 0; j < count; j++)
						if (i != j)
							total_distance += Euclidean::FindDistanceBetweenPoints(points[j], points[i]);
					dp_array[i] = new DistancePoint(points[i], total_distance / count);
				}

				Sorting::qsort_distance_points(dp_array, count);

				DistancePoint* Q1 = dp_array[count / 4];
				DistancePoint* Q3 = dp_array[count / 2 + count / 4 + (count & 1)];
				double IQR = Q3->distance - Q1->distance;
				double upper_fence = Q3->distance + 1.5 * IQR;
				double lower_fence = Q1->distance - 1.5 * IQR;

				std::size_t len = count;
				for (std::size_t i = 0; i < len; )
				{
					if (dp_array[i]->distance < lower_fence || dp_array[i]->distance > upper_fence)
					{
						delete dp_array[i];
						dp_array[i] = nullptr;
						len--;
						for (std::size_t j = i; j < len; j++)
							dp_array[j] = dp_array[j + 1];
					}
					else
						i++;
				}

				for (Point* pt : points)
					delete pt;

				points.clear();

				for (std::size_t i = 0; i < len; i++)
				{
					points.push_back(new Point(dp_array[i]));
					delete dp_array[i];
				}
				delete[] dp_array;

				count = len;
			}

			return count > 0;
		}

		~RawData()
		{
			for (Point* p : points)
			{
				if (p)
					delete p;
			}
		}

	};

	class Data
	{
	private:
		float score;

		double GetNearestCentroidDistance(Point* pt, std::vector<Point*>& centroids)
		{
			double d, min_dist = std::numeric_limits<float>::max();
			std::size_t centroid_count = centroids.size();
			for (std::size_t i = 0; i < centroid_count; i++)
			{
				d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
				if (d < min_dist)
					min_dist = d;
			}
			return min_dist;
		}

		void kMeanspp()
		{
			std::size_t point_count = points.size();
			centroids.push_back(new Point(points[rand() % point_count]));
			double* distances = new double[point_count];
			double sum = 0;
			for (std::size_t index = 1; index < K; index++)
			{
				for (std::size_t i = 0; i < point_count; i++)
					sum += (distances[i] = GetNearestCentroidDistance(points[i], centroids));

				sum *= rand() / (float)RAND_MAX;
				for (std::size_t i = 0; i < point_count; i++)
				{
					sum -= distances[i];
					if (sum < 0)
					{
						centroids.push_back(new Point(points[i]));
						break;
					}
				}
			}
			delete[] distances;
		}

		static bool IsChosen(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count, const std::size_t index)
		{
			for (int i = 0; i < K; i++)
			{
				if (chosen_indexes_to_centroid[i] == count)
					break;
				if (chosen_indexes_to_centroid[i] == index)
					return true;
			}
			return false;
		}

		static std::size_t FindUnchosenRandomIndex(const std::size_t* const chosen_indexes_to_centroid, const int K, const std::size_t count)
		{
			std::size_t index = rand() % count;
			while (true)
			{
				while (IsChosen(chosen_indexes_to_centroid, K, count, index))
					index++;
				if (index < count)
					break;
				index = rand() % count;
			}
			return index;
		}

	public:
		const int K;

		std::vector<Point*> centroids;
		std::vector<ClusteredPoint*> points;

		float KScore()
		{
			if (score == MIN_FLOAT)
				score = Silhouette::CalculateKScore(points, centroids);

			return score;
		}

		Data(const int K) : score(MIN_FLOAT), K(K) {}

		bool Initialize(const bool use_kmeanspp, RawData* raw_data)
		{
			bool ret = false;
			if (raw_data && raw_data->points.size())
			{
				std::size_t count = raw_data->points.size();
				for (std::size_t i = 0; i < count; i++)
					points.push_back(new ClusteredPoint(raw_data->points[i]));

				if (use_kmeanspp) // use k-means++ method to find initial centroids
				{
					kMeanspp();
				}
				else // use total random method to choose centroids
				{
					std::size_t* const chosen_indexes_to_centroid = new std::size_t[K];

					for (int i = 0; i < K; i++)
						chosen_indexes_to_centroid[i] = count;

					for (int i = 0; i < K; i++)
					{
						chosen_indexes_to_centroid[i] = FindUnchosenRandomIndex(chosen_indexes_to_centroid, K, count);
						centroids.push_back(new Point(points[chosen_indexes_to_centroid[i]]));
					}

					delete[] chosen_indexes_to_centroid;
				}
				ret = true;
			}
			return ret;
		}

		~Data()
		{
			for (Point* c : centroids)
			{
				if (c)
					delete c;
			}
			for (ClusteredPoint* p : points)
			{
				if (p)
					delete p;
			}
		}
	};

	class Clustering
	{
	public:

		static std::vector<Data*> Model(const bool use_kmeanspp, const int K, RawData* raw_data)
		{
			std::vector<Data*> dataArray;
			for (int i = 0; i < K; i++)
			{
				Data* data = new Data(K);
				if (!data->Initialize(use_kmeanspp, raw_data))
					delete data;
				else
				{
					std::vector<Point*> temp_points;
					while (true)
					{
						for (ClusteredPoint* point : data->points)
							point->cid = GetNearestCentroidIndex(point, data->centroids);
						CopyPointArrays(data->centroids, temp_points);
						RecenterCentroids(data);
						if (PointArraysAreEqual(data->centroids, temp_points))
							break;
					}
					dataArray.push_back(data);
				}
			}
			return dataArray;
		}

		static void FindBestModelAndDelete(std::vector<std::vector<KMean::Data*>>& models, int* pK, float* pKScore, unsigned int* data_point_count, void** data_points)
		{
			Data *best = nullptr;
			float kscore = MIN_FLOAT;
			for (std::size_t i = 0; i < models.size(); i++)
			{
				for (std::size_t j = 0; j < models[i].size(); j++)
					if (models[i][j]->KScore() > kscore)
					{
						kscore = models[i][j]->KScore();
						best = models[i][j];
					}
			}

			if (best)
			{
				if (pKScore)
					*pKScore = best->KScore();
				if (pK)
					*pK = best->K;
				if (data_point_count)
				{
					std::size_t count = best->points.size();
					(*data_point_count) = (unsigned int)count;
					if (data_points)
					{
						ClusteredPoint* points = new ClusteredPoint[count];
						for (std::size_t i = 0; i < count; i++)
						{
							points[i].cid = best->points[i]->cid;
							points[i].x = best->points[i]->x;
							points[i].y = best->points[i]->y;
							points[i].z = best->points[i]->z;
							points[i].w = best->points[i]->w;
						}
						*data_points = points;
					}
				}
			}
			else
			{
				if (pKScore)
					*pKScore = MIN_FLOAT;
				if (pK)
					*pK = 0;
				if (data_point_count)
				{
					(*data_point_count) = 0;
					if (data_points)
						*data_points = nullptr;
				}
			}

			for (std::size_t i = 0; i < models.size(); i++)
			{
				for (std::size_t j = 0; j < models[i].size(); j++)
					delete models[i][j];
			}

		}

	private:

		static std::size_t GetNearestCentroidIndex(Point* pt, std::vector<Point*>& centroids)
		{
			double d, min_dist = std::numeric_limits<float>::max();
			std::size_t centroid_count = centroids.size();
			std::size_t min_index = centroid_count;
			for (std::size_t i = 0; i < centroid_count; i++)
			{
				d = Euclidean::FindDistanceBetweenPointsSquared(pt, centroids[i]);
				if (d < min_dist)
				{
					min_dist = d;
					min_index = i;
				}
			}
			return min_index;
		}

		static void CopyPointArrays(std::vector<Point*> &src, std::vector<Point*> &dst)
		{
			if (dst.size() < src.size())
			{
				for (std::size_t i = 0; i < src.size(); i++)
				{
					Point* pt = new Point(src[i]);
					pt->x = src[i]->x;
					pt->y = src[i]->y;
					pt->z = src[i]->z;
					pt->w = src[i]->w;
					dst.push_back(pt);
				}
			}
			else
			{
				for (std::size_t i = 0; i < src.size(); i++)
				{
					dst[i]->x = src[i]->x;
					dst[i]->y = src[i]->y;
					dst[i]->z = src[i]->z;
					dst[i]->w = src[i]->w;
				}
			}
		}

		static bool PointArraysAreEqual(std::vector<Point*> &points1, std::vector<Point*> &points2)
		{
			std::size_t count = points1.size();
			for (std::size_t i = 0; i < points1.size() && count > 0; i++)
			{
				for (std::size_t j = 0; j < points2.size() && count > 0; j++)
				{
					if (points2[j]->x == points1[i]->x && points2[j]->y == points1[i]->y && points2[j]->z == points1[i]->z && points2[j]->w == points1[i]->w)
						count--;
				}
			}
			return count == 0;
		}

		static void RecenterCentroids(Data* data)
		{
			const int K = data->K;
			for (int i = 0; i < K; i++)
			{
				int c = 0;
				double x_total = 0;
				double y_total = 0;
				double z_total = 0;
				double w_total = 0;
				for (ClusteredPoint* point : data->points)
				{
					if (point->cid == i)
					{
						c++;
						x_total += point->x;
						y_total += point->y;
						z_total += point->z;
						w_total += point->w;
					}
				}
				if (c == 0)
					data->centroids[i]->x = data->centroids[i]->y = data->centroids[i]->z = data->centroids[i]->w = 0;
				else
				{
					data->centroids[i]->x = x_total / c;
					data->centroids[i]->y = y_total / c;
					data->centroids[i]->z = z_total / c;
					data->centroids[i]->w = w_total / c;
				}
			}
		}
	};

	struct thread_params
	{
		int active;
		const bool use_kmeanspp;
		const int K;
		std::vector<Data*> data;
		RawData* raw_data;
		std::thread* pthread;
		thread_params(const bool use_kmeanspp, const int K, RawData* raw_data) : active(true), use_kmeanspp(use_kmeanspp), K(K), raw_data(raw_data), pthread(nullptr) {}
	};

	static void thread(thread_params* params)
	{
		params->data = Clustering::Model(params->use_kmeanspp, params->K, params->raw_data);
		params->active = false;
	}

	static void FindBestK(CSVParser::CSVNumericData &csv_data, const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY, const int columnZ, const int columnW, int* const pK, float* const pKScore, unsigned int* const data_point_count, void** const data_points)
	{
		if (csv_data.row_count() > 0)
		{
			std::vector<std::vector<KMean::Data*>> modelData;
			RawData* raw_data = new RawData(columnX, columnY, columnZ, columnW);
			if (raw_data->Initialize(remove_outliers, csv_data))
			{
				if (multithreaded)
				{
					std::vector<thread_params*> threads;
					for (int K = KMean::KMIN; K <= KMean::KMAX; K++)
					{
						for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
						{
							thread_params* params = new thread_params(use_kmeanspp, K, raw_data);
							params->pthread = new std::thread(thread, params);
							threads.push_back(params);
						}
					}
					while (true)
					{
						int active_count = 0;
						for (std::size_t i = 0; i < threads.size(); i++)
							active_count += threads[i]->active;
						if (!active_count)
							break;
						std::this_thread::sleep_for(std::chrono::milliseconds(100));
					}
					for (std::size_t i = 0; i < threads.size(); i++)
						if (threads[i]->pthread->joinable())
							threads[i]->pthread->join();
					std::this_thread::sleep_for(std::chrono::seconds(1));
					for (std::size_t i = 0; i < threads.size(); i++)
					{
						modelData.push_back(threads[i]->data);
						delete threads[i]->pthread;
						delete threads[i];
					}
				}
				else
				{
					for (int K = KMean::KMIN; K <= KMean::KMAX; K++)
						for (int i = 0; i < (attempts_to_improve_kscore ? attempts_to_improve_kscore : 1); i++)
							modelData.push_back(Clustering::Model(use_kmeanspp, K, raw_data));
				}
			}
			delete raw_data;
			Clustering::FindBestModelAndDelete(modelData, pK, pKScore, data_point_count, data_points);
		}
	}

	void FindBestKFromFile(const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const char* filename, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
	{
		srand(time(0)); // initialize rand()

		std::fstream file;
		file.open(filename, std::ios::in);
		if (file.is_open())
		{
			CSVParser::CSVNumericData csv_data(file, first_row_column_names);
			FindBestK(csv_data, remove_outliers, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
		}
	}

	void FindBestKFromCSV(const bool remove_outliers, const bool use_kmeanspp, const bool multithreaded, const char *csv_string, const int attempts_to_improve_kscore, const bool first_row_column_names, const int columnX, const int columnY = -1, const int columnZ = -1, const int columnW = -1, int* const pK = nullptr, float* const pKScore = nullptr, unsigned int* const data_point_count = nullptr, void** const data_points = nullptr)
	{
		srand(time(0)); // initialize rand()

		std::istringstream ss(csv_string);

		CSVParser::CSVNumericData csv_data(ss, first_row_column_names);
		FindBestK(csv_data, remove_outliers, use_kmeanspp, multithreaded, attempts_to_improve_kscore, first_row_column_names, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
	}

	void FreeMemory(void* memory_ptr)
	{
		delete[] (ClusteredPoint*)memory_ptr;
	}
}


#endif

// csvparser.h

#include <istream>
#include <string>
#include <vector>
#include "hashtable.h"

#ifndef CSVPARSER_H

#define CSVPARSER_H

namespace CSVParser
{
	enum CSVState
	{
		UnquotedField,
		QuotedField,
		QuotedQuote
	};

	class CSVNumericData
	{
	private:
		bool firstRowHeadings;
		std::vector<std::vector<float>> table;
		std::vector<std::string> col_names;
		Hashtable<std::string, float> strings;

		static bool isNumber(const std::string& s)
		{
			std::size_t char_pos(0);

			char_pos = s.find_first_not_of(' ');
			if (char_pos == s.size()) return false;

			// check the significand
			if (s[char_pos] == '+' || s[char_pos] == '-')
				++char_pos; // skip the sign if exist

			int n_nm, n_pt, n_exp, n_exp_sign;
			bool n_fail;
			for (n_nm = 0, n_pt = 0, n_exp = 0, n_exp_sign = 0, n_fail = false; !n_fail && (std::isdigit(s[char_pos]) || s[char_pos] == '.' || s[char_pos] == 'e' || s[char_pos] == 'E' || s[char_pos] == '+' || s[char_pos] == '-'); ++char_pos)
			{
				if (s[char_pos] == '.')
					++n_pt;
				else if (s[char_pos] == 'e' || s[char_pos] == 'E')
					++n_exp;
				else if (s[char_pos] == '+' || s[char_pos] == '-')
				{
					if (n_exp == 1)
						++n_exp_sign;
					else
						n_fail = true;
				}
				else
					++n_nm;
				n_fail = (n_fail || n_pt > 1 || n_exp > 1 || n_exp_sign > 1);
			}

			if (n_fail || n_nm < 1) // no more than one point, at least one digit
				return false;

			// skip the trailing whitespaces
			while (s[char_pos] == ' ' || s[char_pos] == '\r' || s[char_pos] == '\n')
				++char_pos;

			return char_pos == s.size();  // must reach the ending 0 of the string
		}

		float GetDataValue(const std::string& d)
		{
			if (isNumber(d))
				return std::stof(d);
			HashtableItem<std::string, float>* entry = strings[d];
			if (entry)
				return entry->Value();
			float f = strings.Count();
			strings.Add(d, f);
			return f;
		}

		std::vector<float> ConvertToFloatRow(const std::vector<std::string>& row)
		{
			std::vector<float> float_row;
			for (std::size_t i = 0; i < row.size(); i++)
				float_row.push_back(GetDataValue(row[i]));
			return float_row;
		}

		std::vector<std::string> ReadCSVRow(const std::string& row)
		{
			CSVState state = CSVState::UnquotedField;
			std::vector<std::string> fields {""};
			size_t i = 0; // index of the current field
			for (char c : row)
			{
				switch (state)
				{
				case CSVState::UnquotedField:
					switch (c)
					{
					case ',': // end of field
						fields.push_back(""); i++;
						break;
					case '"': state = CSVState::QuotedField;
						break;
					default:  fields[i].push_back(c);
						break;
					}
					break;
				case CSVState::QuotedField:
					switch (c)
					{
					case '"': state = CSVState::QuotedQuote;
						break;
					default:  fields[i].push_back(c);
						break;
					}
					break;
				case CSVState::QuotedQuote:
					switch (c)
					{
					case ',': // , after closing quote
						fields.push_back(""); i++;
						state = CSVState::UnquotedField;
						break;
					case '"': // "" -> "
						fields[i].push_back('"');
						state = CSVState::QuotedField;
						break;
					default:  // end of quote
						state = CSVState::UnquotedField;
						break;
					}
					break;
				}
			}
			return fields;
		}

		/// Read CSV file, Excel dialect. Accept "quoted fields ""with quotes"""
		void ReadCSV(std::istream& in, bool firstRowColumnNames)
		{
			std::vector<std::vector<std::string>> raw_table;
			std::string row;
			while (!in.eof())
			{
				std::getline(in, row);

				if (in.bad() || in.fail())
					break;

				if (row.length() == 0)
					continue;

				if (firstRowColumnNames)
				{
					firstRowColumnNames = false;
					col_names = ReadCSVRow(row);
				}
				else
					raw_table.push_back(ReadCSVRow(row));
			}
			std::size_t row_count = raw_table.size(), current_col = 0, current_row = 0;
			while(true)
			{
				std::vector<float> floats;
				if(current_col == 0)
					table.push_back(floats);
				if (current_col < raw_table[current_row].size())
					table[current_row].push_back(GetDataValue(raw_table[current_row][current_col]));
				if (current_row == row_count - 1)
				{
					current_col++;
					if (raw_table[current_row].size() == current_col)
						break;
				}
				current_row++;
				if (current_row == row_count)
					current_row = 0;
			}
			strings.Clear();
		}

	public:
		CSVNumericData(std::istream& csv_string, const bool firstRowColumnNames)
		{
			ReadCSV(csv_string, firstRowColumnNames);
		}
		std::vector<float> operator[](const std::size_t index) const
		{
			return table[index];
		}
		std::size_t row_count()
		{
			return table.size();
		}
		std::vector<std::string> column_names()
		{
			return col_names;
		}
	};

}

#endif

// hashtable.h

#include <iostream>
#include <iomanip>
#include <string>

#ifndef HASHTABLE_H

#define HASHTABLE_H

template <class TKey, class TValue> class Hashtable;

#define TABLE_SIZE 1000003 // use a large prime number

unsigned int hash_func(const std::string& key)
{
	unsigned int h = 0;
	unsigned int o = 31415;
	const unsigned int t = 27183;
	const char* k = key.c_str();
	while (*k)
	{
		h = (o * h + *k++) % TABLE_SIZE;
		o = o * t % (TABLE_SIZE - 1);
	}
	return h;
}

unsigned int hash_func(const unsigned int key)
{
	// make sure table size is actually (TABLE_SIZE+1) (more about this below)
	return(key & TABLE_SIZE);
}

template <class TKey, class TValue> class HashtableItem;
template <class TKey, class TValue> class HashtableDataMembers;

template <class TKey, class TValue> class HashtableDataMembers
{
public:
	HashtableItem<TKey, TValue>** table;
	HashtableItem<TKey, TValue>* cur_table_item;
	unsigned int table_size, count;
	HashtableDataMembers();
	~HashtableDataMembers();
};

template <class TKey, class TValue> class Hashtable
{
private:
	HashtableDataMembers<TKey, TValue>* dm;

public:
	Hashtable();
	Hashtable(const Hashtable& obj);
	~Hashtable();
	bool Add(const TKey& key, const TValue& value); // Add a new entry, returns false when the key already exists
	const Hashtable<TKey, TValue>& operator=(const Hashtable<TKey, TValue>& obj);
	HashtableItem<TKey, TValue>* operator[](const TKey& key) const;
	void Remove(const TKey& key); // removes one table entry
	void Clear(); // removes all the table entries
	unsigned int Count(); // returns count of entries;
};

template <class TKey, class TValue> class HashtableItem
{
private:
	HashtableItem<TKey, TValue>* pnext;
	TKey key;
	TValue value;

	// keep these private to prevent the client from creating this object
	HashtableItem() {}
	HashtableItem(const TKey& key, const TValue& value);
	~HashtableItem();

public:
	const TKey& Key() const;
	const TValue& Value() const;
	const TValue& operator=(const TValue& value);

	// some friend functions that can access the private data
	friend bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value);
	friend HashtableDataMembers<TKey, TValue>::HashtableDataMembers();
	friend HashtableDataMembers<TKey, TValue>::~HashtableDataMembers();
	friend void Hashtable<TKey, TValue>::Remove(const TKey& key);
	friend const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj);
	friend HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const;
	friend void Hashtable<TKey, TValue>::Clear();
};





// ##################### class HashtableDataMembers ###################
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::HashtableDataMembers()
{
	cur_table_item = nullptr;
	count = 0;
	table_size = TABLE_SIZE + 1; // add one to the TABLE_SIZE because the int hash_func can actually return TABLE_SIZE
	table = new HashtableItem<TKey, TValue>* [table_size]();
	for (unsigned int i = 0; i < table_size; table[i++] = nullptr);
}
template <class TKey, class TValue> HashtableDataMembers<TKey, TValue>::~HashtableDataMembers()
{
	for (unsigned int i = 0; i < table_size; i++)
	{
		if (table[i])
		{
			delete table[i];
			table[i] = nullptr; // clean-up the memory
		}
	}
	delete[] table;
}




// ##################### class Hashtable ###########################
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable()
{
	dm = new HashtableDataMembers<TKey, TValue>();
}
template <class TKey, class TValue> Hashtable<TKey, TValue>::Hashtable(const Hashtable& obj)
{
	dm = new HashtableDataMembers<TKey, TValue>();
	this->operator=(obj);
}
template <class TKey, class TValue> Hashtable<TKey, TValue>::~Hashtable()
{
	delete dm;
	dm = nullptr;
}
template <class TKey, class TValue> const Hashtable<TKey, TValue>& Hashtable<TKey, TValue>::operator=(const Hashtable<TKey, TValue>& obj)
{
	if (this != &obj)
	{
		unsigned int i;
		Clear();
		if (dm->table_size != obj.dm->table_size)
		{
			if (dm->table)
				delete[]dm->table;
			dm->table_size = obj.dm->table_size;
			dm->table = new HashtableItem<TKey, TValue>* [dm->table_size]();
			for (i = 0; i < dm->table_size; dm->table[i++] = nullptr);
		}
		for (i = 0; i < obj.dm->table_size; i++)
		{
			HashtableItem<TKey, TValue>** ppobjnode, ** ppnode;
			ppobjnode = &obj.dm->table[i];
			ppnode = &dm->table[i];
			while (*ppobjnode)
			{
				(*ppnode) = new HashtableItem<TKey, TValue>((*ppobjnode)->key, (*ppobjnode)->value);
				if (obj.dm->cur_table_item == *ppobjnode)
					dm->cur_table_item = *ppnode;
				ppnode = &(*ppnode)->pnext;
				ppobjnode = &(*ppobjnode)->pnext;
			}
		}
	}
	return (*this);
}
template <class TKey, class TValue> bool Hashtable<TKey, TValue>::Add(const TKey& key, const TValue& value)
{
	unsigned int i = hash_func(key);
	HashtableItem<TKey, TValue>** ppnode;
	for (ppnode = &dm->table[i]; (*ppnode) && ((*ppnode)->key != key); ppnode = &(*ppnode)->pnext);
	if (*ppnode) // then found
		return false;
	(*ppnode) = new HashtableItem<TKey, TValue>(key, value);
	dm->count++;
	return true;
}
template <class TKey, class TValue> HashtableItem<TKey, TValue>* Hashtable<TKey, TValue>::operator[](const TKey& key) const
{
	unsigned int i = hash_func(key);
	HashtableItem<TKey, TValue>* pnode;
	for (pnode = dm->table[i]; pnode && (pnode->key != key); pnode = pnode->pnext);
	return pnode;
}
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Remove(const TKey& key)
{
	unsigned int i = hash_func(key);
	HashtableItem<TKey, TValue>** ppnode, * tmp;
	for (ppnode = &dm->table[i]; (*ppnode) && ((*ppnode)->key != key); ppnode = &(*ppnode)->pnext);
	if (*ppnode)
	{
		tmp = (*ppnode);
		(*ppnode) = (*ppnode)->pnext;
		tmp->pnext = nullptr;
		delete tmp;
	}
	dm->count--;
}
template <class TKey, class TValue> void Hashtable<TKey, TValue>::Clear()
{
	for (unsigned int i = 0; i < dm->table_size; i++)
	{
		if (dm->table[i])
		{
			delete dm->table[i];
			dm->table[i] = nullptr;
		}
	}
	dm->count = 0;
}
template <class TKey, class TValue> unsigned int Hashtable<TKey, TValue>::Count()
{
	return dm->count;
}




// ##################### class HashtableItem ###########################
template <class TKey, class TValue> HashtableItem<TKey, TValue>::HashtableItem(const TKey& xKey, const TValue& xValue)
{
	key = xKey;
	value = xValue;
	pnext = nullptr;
}
template <class TKey, class TValue> HashtableItem<TKey, TValue>::~HashtableItem()
{
	if (this->pnext)
	{
		delete this->pnext;
		this->pnext = nullptr; // clean-up memory
	}
}
template <class TKey, class TValue> const TKey& HashtableItem<TKey, TValue>::Key() const
{
	return this->key;
}
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::Value() const
{
	return this->value;
}
template <class TKey, class TValue> const TValue& HashtableItem<TKey, TValue>::operator=(const TValue& value)
{
	this->value = value;
	return value;
}

#endif

Windows Specific Code


// dllmain.cpp

#include "kmean.h"
using namespace KMean;

#include <windows.h>

BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReserved)
{
	switch (ul_reason_for_call)
	{
	case DLL_PROCESS_ATTACH:
	case DLL_THREAD_ATTACH:
	case DLL_THREAD_DETACH:
	case DLL_PROCESS_DETACH:
		break;
	}
	return TRUE;
}

extern "C"
{
	void WinFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, const char* filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
	{
		FindBestKFromFile(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, filename, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
	}

	void WinFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, const char* csv_string, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
	{
		FindBestKFromCSV(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, csv_string, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
	}

	void* WinGetClusteredPointElement(unsigned int index, void* data_points)
	{
		return &((ClusteredPoint*)data_points)[index];
	}

	void WinFreeMemory(void* memory_ptr)
	{
		FreeMemory(memory_ptr);
	}
}

LIBRARY KMeanSilhouetteDll
EXPORTS
	WinFindBestKFromFile @1
	WinFindBestKFromCSV @2
	WinGetClusteredPointElement @3
	WinFreeMemory @4

Linux Specific Code


// LINUX SPECIFIC SOURCE FILE: FOR SHARED LIBRARY FUNCTION EXPORTS

#include "kmean.h"

using namespace KMean;

extern "C"
{
	__attribute__((visibility("default"))) void LinuxFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, const char* filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
	{
		FindBestKFromFile(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, filename, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
	}

	__attribute__((visibility("default"))) void LinuxFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, const char* csv_string, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY = -1, int columnZ = -1, int columnW = -1, int* pK = nullptr, float* pKScore = nullptr, unsigned int* data_point_count = nullptr, void** data_points = nullptr)
	{
		FindBestKFromCSV(remove_outliers != 0, use_kmeanspp != 0, multithreaded != 0, csv_string, attempts_to_improve_kscore, first_row_column_names != 0, columnX, columnY, columnZ, columnW, pK, pKScore, data_point_count, data_points);
	}

	__attribute__((visibility("default"))) void* LinuxGetClusteredPointElement(unsigned int index, void* data_points)
	{
		return &((ClusteredPoint*)data_points)[index];
	}

	__attribute__((visibility("default"))) void LinuxFreeMemory(void* memory_ptr)
	{
		FreeMemory(memory_ptr);
	}
}

C# Code

THIS C# CODE FILE USES A DLL ON WINDOWS OR SHARED LIBRARY ON LINUX COMPILED FROM THE CODE ABOVE.


// CppKMean.cs

using System.Collections.Generic;
using System.Runtime.InteropServices;

namespace Cpp.KMean
{
	[StructLayout(LayoutKind.Sequential)]
	public class ClusteredPoint
	{
		public float x { get; set; }

		public float y { get; set; }

		public float z { get; set; }

		public float w { get; set; }

		public int cid { get; set; }
	}

	public class Clustering
	{
		[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
		private static extern void WinFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, string filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);

		[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
		private static extern void WinFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, string csv_data, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);

		[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
		private static extern IntPtr WinGetClusteredPointElement(uint index, IntPtr data_points);

		[DllImport("KMeanClustering.dll", CallingConvention = CallingConvention.Cdecl)]
		private static extern void WinFreeMemory(IntPtr memory_ptr);

		[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
		private static extern void LinuxFindBestKFromFile(int remove_outliers, int use_kmeanspp, int multithreaded, string filename, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);

		[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
		private static extern void LinuxFindBestKFromCSV(int remove_outliers, int use_kmeanspp, int multithreaded, string csv_data, int attempts_to_improve_kscore, int first_row_column_names, int columnX, int columnY, int columnZ, int columnW, out int pK, out float pKScore, out uint data_point_count, out IntPtr data_points);

		[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
		private static extern IntPtr LinuxGetClusteredPointElement(uint index, IntPtr data_points);

		[DllImport("libKMeanClustering.so", CallingConvention = CallingConvention.Cdecl)]
		private static extern void LinuxFreeMemory(IntPtr memory_ptr);

		static public IEnumerable<ClusteredPoint> FindBestKFromFile(bool remove_outliers, bool use_kmeanspp, bool multithreaded, string filename, int attempts_to_improve_kscore, bool first_row_column_names, out int pK, out float pKScore, int columnX, int? columnY, int? columnZ, int? columnW)
		{
			if (!File.Exists(filename) || new FileInfo(filename).Length == 0)
			{
				pK = 0;
				pKScore = float.MinValue;
				return new List<ClusteredPoint>();
			}
			uint count;
			IntPtr array;
			if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
			{
				LinuxFindBestKFromFile(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, filename, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
				var point = new ClusteredPoint();
				var points = new List<ClusteredPoint>();
				for (uint i = 0; i < count; i++)
				{
					Marshal.PtrToStructure(LinuxGetClusteredPointElement(i, array), point);
					points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
				}
				LinuxFreeMemory(array);
				return points;
			}
            else if(RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
            {
				WinFindBestKFromFile(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, filename, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
				var point = new ClusteredPoint();
				var points = new List<ClusteredPoint>();
				for (uint i = 0; i < count; i++)
				{
					Marshal.PtrToStructure(WinGetClusteredPointElement(i, array), point);
					points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
				}
				WinFreeMemory(array);
				return points;
			}
			else
			{
				pKScore = pK = 0;
				return new List<ClusteredPoint>();
			}
		}

		static public IEnumerable<ClusteredPoint> FindBestKFromCSV(bool remove_outliers, bool use_kmeanspp, bool multithreaded, string csv_data, int attempts_to_improve_kscore, bool first_row_column_names, out int pK, out float pKScore, int columnX, int? columnY, int? columnZ, int? columnW)
		{
			if (csv_data.Length == 0)
			{
				pK = 0;
				pKScore = float.MinValue;
				return new List<ClusteredPoint>();
			}
			uint count;
			IntPtr array;
			if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
			{
				LinuxFindBestKFromCSV(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, csv_data, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
				var point = new ClusteredPoint();
				var points = new List<ClusteredPoint>();
				for (uint i = 0; i < count; i++)
				{
					Marshal.PtrToStructure(LinuxGetClusteredPointElement(i, array), point);
					points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
				}
				LinuxFreeMemory(array);
				return points;
			}
			else if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
			{
				WinFindBestKFromCSV(remove_outliers ? 1 : 0, use_kmeanspp ? 1 : 0, multithreaded ? 1 : 0, csv_data, attempts_to_improve_kscore, first_row_column_names ? 1 : 0, columnX, columnY ?? -1, columnZ ?? -1, columnW ?? -1, out pK, out pKScore, out count, out array);
				var point = new ClusteredPoint();
				var points = new List<ClusteredPoint>();
				for (uint i = 0; i < count; i++)
				{
					Marshal.PtrToStructure(WinGetClusteredPointElement(i, array), point);
					points.Add(new ClusteredPoint { cid = point.cid, x = point.x, y = point.y, z = point.z, w = point.w });
				}
				WinFreeMemory(array);
				return points;
			}
			else
			{
				pKScore = pK = 0;
				return new List<ClusteredPoint>();
			}
		}

	}
}

This site uses cookies. Cookies are simple text files stored on the user's computer. They are used for adding features and security to this site. Read the privacy policy.
CLOSE