.NET: Neural Network, Supervised Deep Machine Learning Example in C#

An example neural network / deep learning library written in C#; it works with practically any data, provided the network is properly trained.

Supervised learning is a machine learning paradigm for problems where the available data consists of labeled examples, meaning that each data point contains features (covariates) and an associated label. The goal of a supervised learning algorithm is to learn a function that maps feature vectors (inputs) to labels (outputs), based on example input-output pairs.
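
For example, a single labeled data point for a three-class problem could pair a feature (column) vector with a one-hot label, using the Matrix class from the listing below (the values here are made up purely for illustration):

	Matrix features = new Matrix(4, 1, new double[] { 0.2, 0.7, 0.1, 0.9 }); // covariates (inputs)
	Matrix label = new Matrix(3, 1, new double[] { 0, 1, 0 });               // one-hot label marking class 1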

See the unsupervised learning version. Also, see the convolutional neural network example.

Learn about feedforward neural networks.

See this code in action here and here.

Download these files, including example code showing how to train the network and the MNIST image files of handwritten digits with their labels: NEURALNETWORKMNIST.zip. Experiment with the number of neurons and layers. The example usage code requires the SixLabors.ImageSharp NuGet package.
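
To get a feel for the API before the full listing, here is a minimal, hypothetical training sketch for MNIST-style data (784 inputs, 10 one-hot outputs); it assumes trainingImages and trainingLabels are List<Matrix> collections loaded elsewhere (for example, from the ZIP above), and the layer sizes and hyperparameters are only illustrative:

	var activation = new ML.ActivationReLUSoftmax();
	var cost = new ML.CostCategoricalCrossEntropy();
	var nn = new ML.NeuralNetwork(new int[] { 784, 100, 10 }, activation, ML.Randomization.HeNormal, clipThreshold: 5.0);

	const int epochCount = 10;
	const double initialLearningRate = 0.1, lambda = 1e-5;
	for (int epoch = 0; epoch < epochCount; epoch++)
	{
		// decay the learning rate each epoch, as suggested in the comments in the listing
		double learningRate = Math.Pow(0.1, epoch / (double)epochCount) * initialLearningRate;
		nn.Train(trainingImages, trainingLabels, activation, cost, learningRate, lambda);
	}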

The neural network code:


// NeuralNetwork.cs
// Compatible with .NET Core 3.1 and later
using System;
using System.Collections.Generic;
using System.Threading;

namespace ML
{
	public class NeuralNetwork
	{
		public List<Matrix> Weights { get; set; }

		public List<Matrix>? Biases { get; set; }

		public int LayerCount { get; set; }

		public double ClipThreshold { get; set; }

		public string ActivationName { get; set; }

		public NeuralNetwork()
		{
			Weights = new List<Matrix>();
			ActivationName = string.Empty;
		}

		// clipThreshold may be needed when training on large amounts of data, for example image recognition with a somewhat large CNN
		public NeuralNetwork(int[] neuronCounts, IActivationMethods activationObject, Randomization weightsBiasesInit, double clipThreshold = 0, bool biases = true)
		{
			if (biases)
			{
				Biases = new List<Matrix>();

				for (int i = 1; i < neuronCounts.Length; i++)
					Biases.Add(new Matrix(neuronCounts[i], 1));
			}

			Weights = new List<Matrix>();
			for (int i = 0; i < neuronCounts.Length - 1; i++)
				Weights.Add(new Matrix(neuronCounts[i + 1], neuronCounts[i]));

			LayerCount = neuronCounts.Length;

			ClipThreshold = clipThreshold;

			ActivationName = activationObject.ActivationName;

			Randomize(weightsBiasesInit);
		}

		// lambda is the L2 regularization coefficient and should be a very small fraction (between zero and one) to help prevent overfitting and exploding gradients
		// at zero it provides no regularization and risks exploding gradients; in that case use a clipThreshold, such as 5.0
		// training may learn more slowly with many threads because each thread's mini-batch is smaller and therefore has less to learn from; consider decreasing the number of threads as the epoch count increases and experimenting with the learning rate
		// learningRate can be decayed per epoch, e.g.: Math.Pow(0.1, epoch / (double)epochCount) * initialLearningRate
		public void Train(List<Matrix> givenInputs, List<Matrix> desiredOutputs, IActivationMethods activationObject, ICostMethods costMethods, double learningRate, double lambda, int threadCount = 0)
		{
			if (ActivationName != activationObject.ActivationName)
				throw new ArgumentException($"activationObject argument incorrect: {activationObject.ActivationName}");

			if (givenInputs.Count != desiredOutputs.Count)
				throw new ArgumentException("\"givenInputs\" count must match \"desiredOutputs\" count.");

			if (threadCount < 1)
				threadCount = Environment.ProcessorCount;
			int mini_batch_size = givenInputs.Count / threadCount;
			if (mini_batch_size > 0)
			{
				var bps = new List<BatchParams>();
				for (int x = 0; x < threadCount; x++)
				{
					var bp = new BatchParams()
					{
						network = this,
						costMethods = costMethods,
						givenInputs = givenInputs.GetRange(x * mini_batch_size, mini_batch_size),
						desiredOutputs = desiredOutputs.GetRange(x * mini_batch_size, mini_batch_size),
						local_weights = new List<Matrix>(),
						local_biases = Biases == null ? null : new List<Matrix>(),
						activationObject = activationObject,
						learningRate = learningRate,
						lambda = lambda,
						delta_gradient_w = new Matrix[Weights.Count],
						delta_gradient_b = Biases == null ? null : new Matrix[Biases.Count],
						thread = new Thread(TrainMiniBatch) { IsBackground = true },
						threadCount = threadCount
					};

					foreach (var weight in Weights)
						bp.local_weights.Add(new Matrix(weight));

					if(Biases != null)
						foreach (var bias in Biases)
							bp.local_biases.Add(new Matrix(bias));

					if (x == threadCount - 1)
						TrainMiniBatch(bp);
					else
					{
						bps.Add(bp);
						bp.thread.Start(bp);
					}
				}
				foreach (var b in bps)
				{
					b.thread?.Join(); // wait for every worker thread to finish its mini-batch
					b.thread = null;
				}

				for (int x = 0; x < Weights.Count; x++)
				{
					var weights = new List<Matrix>();
					for (int i = 0; i < bps.Count; i++)
						weights.Add(bps[i].local_weights[x]);
					ParameterAveraging(Weights[x], weights);
				}

				if (Biases != null)
				{
					for (int x = 0; x < Biases.Count; x++)
					{
						var biases = new List<Matrix>();
						for (int i = 0; i < bps.Count; i++)
							biases.Add(bps[i].local_biases[x]);
						ParameterAveraging(Biases[x], biases);
					}
				}
			}
			for (int x = threadCount * mini_batch_size; x < givenInputs.Count; x++)
				Train(givenInputs[x], desiredOutputs[x], activationObject, costMethods, learningRate, lambda);
		}

		// lambda is the L2 regularization coefficient and should be a very small fraction (between zero and one) to help prevent overfitting and exploding gradients
		// at zero it provides no regularization and risks exploding gradients; in that case use a clipThreshold, such as 5.0
		// learningRate can be decayed per epoch, e.g.: Math.Pow(0.1, epoch / (double)epochCount) * initialLearningRate
		public void Train(Matrix givenInputs, Matrix desiredOutputs, IActivationMethods activationObject, ICostMethods costMethods, double learningRate, double lambda)
		{
			if (ActivationName != activationObject.ActivationName)
				throw new ArgumentException($"activationObject argument incorrect: {activationObject.ActivationName}");

			var delta_gradient_w = new Matrix[Weights.Count];
			var delta_gradient_b = Biases == null ? null : new Matrix[Biases.Count];

			BackPropagate(this, Weights, Biases, givenInputs, desiredOutputs, activationObject, costMethods, delta_gradient_w, delta_gradient_b);

			var new_weights = new List<Matrix>();
			var new_biases = Biases == null ? null : new List<Matrix>();

			for (int i = 0; i < delta_gradient_w.Length; i++)
			{
				for (int j = 0; j < delta_gradient_w[i].rows; j++)
					for (int k = 0; k < delta_gradient_w[i].columns; k++)
					{
						double w = Weights[i].GetValue(j, k);
						double nw = delta_gradient_w[i].GetValue(j, k);
						delta_gradient_w[i].SetValue(j, k, (1 - learningRate * lambda) * w - learningRate * nw);
					}
				new_weights.Add(delta_gradient_w[i]);
			}

			if (delta_gradient_b != null)
			{
				for (int i = 0; i < delta_gradient_b.Length; i++)
				{
					for (int j = 0; j < delta_gradient_b[i].rows; j++)
					{
						double b = Biases[i].GetValue(j, 0);
						double nb = delta_gradient_b[i].GetValue(j, 0);
						delta_gradient_b[i].SetValue(j, 0, b - learningRate * nb);
					}
					new_biases.Add(delta_gradient_b[i]);
				}
			}

			Weights = new_weights;
			Biases = new_biases;
		}

		public double Calculate(Matrix givenInputs, Matrix desiredOutputs, IActivationMethods activationObject, ICostMethods costMethods)
		{
			if (givenInputs.rows != Weights[0].columns || desiredOutputs.rows != Weights[^1].rows)
				throw new ArgumentException("The givenInputs and desiredOutputs arguments do not match the network's input and output layer sizes.");

			return costMethods.LossFunction(FeedForward(givenInputs, activationObject), desiredOutputs);
		}

		#region Private_Decl
		private static void TrainMiniBatch(object? o)
		{
			BatchParams bp = (BatchParams)o!; // o is always the BatchParams instance passed to Thread.Start

			for (int x = 0; x < bp.givenInputs.Count; x++)
			{
				BackPropagate(bp.network, bp.local_weights, bp.local_biases, bp.givenInputs[x], bp.desiredOutputs[x], bp.activationObject, bp.costMethods, bp.delta_gradient_w, bp.delta_gradient_b);

				for (int i = 0; i < bp.delta_gradient_w.Length; i++)
				{
					for (int row = 0; row < bp.delta_gradient_w[i].rows; row++)
						for (int column = 0; column < bp.delta_gradient_w[i].columns; column++)
						{
							double w = bp.local_weights[i].GetValue(row, column);
							double nw = bp.delta_gradient_w[i].GetValue(row, column);
							bp.local_weights[i].SetValue(row, column, (1 - bp.learningRate * bp.lambda) * w - (bp.learningRate / bp.threadCount) * nw);
						}
				}

				if (bp.delta_gradient_b != null)
				{
					for (int i = 0; i < bp.delta_gradient_b.Length; i++)
					{
						for (int row = 0; row < bp.delta_gradient_b[i].rows; row++)
						{
							double b = bp.local_biases[i].GetValue(row, 0);
							double nb = bp.delta_gradient_b[i].GetValue(row, 0);
							bp.local_biases[i].SetValue(row, 0, b - (bp.learningRate / bp.threadCount) * nb);
						}
					}
				}
			}
		}

		private static void ParameterAveraging(Matrix globalParameters, List<Matrix> localParametersOfThreads)
		{
			// Initialize a temporary matrix to hold the sum of local parameters
			Matrix sumOfLocalParams = new Matrix(globalParameters.rows, globalParameters.columns);
			for (int threadId = 0; threadId < localParametersOfThreads.Count; threadId++)
				for (int row = 0; row < localParametersOfThreads[threadId].rows; row++)
					for (int column = 0; column < localParametersOfThreads[threadId].columns; column++)
						sumOfLocalParams.SetValue(row, column, sumOfLocalParams.GetValue(row, column) + localParametersOfThreads[threadId].GetValue(row, column));
			// Update the global parameter matrix using parameter averaging formula
			for (int row = 0; row < globalParameters.rows; ++row)
				for (int column = 0; column < globalParameters.columns; ++column)
					globalParameters.SetValue(row, column, sumOfLocalParams.GetValue(row, column) / localParametersOfThreads.Count);
		}

		private static void BackPropagate(NeuralNetwork network, List<Matrix> Weights, List<Matrix>? Biases, Matrix givenInputs, Matrix desiredOutputs, IActivationMethods activationObject, ICostMethods costMethods, Matrix[] delta_gradient_w, Matrix[]? delta_gradient_b) // uses Stochastic Gradient Descent
		{
			Matrix activation = givenInputs;

			List<Matrix> activations = new List<Matrix> { activation };

			List<Matrix> zs = new List<Matrix>();

			// feed forward
			for (int i = 0; i < network.LayerCount - 1; i++)
			{
				Matrix? z;
				Matrix.Multiply(Weights[i], activation, out z);
				if (z == null)
					throw new ArgumentException("Cannot multiply matrices.");

				if (Biases != null) // add bias
				{
					for (int j = 0; j < z.rows; j++)
						for (int k = 0; k < z.columns; k++)
							z.SetValue(j, k, z.GetValue(j, k) + Biases[i].GetValue(j, 0));
				}

				zs.Add(new Matrix(z));

				if (i < network.LayerCount - 2)
					activationObject.ActivationMethod(z);
				else
					activationObject.OutputActivationMethod(z);
				activation = z;
				activations.Add(activation);
			}

			// backward pass

			Matrix delta = new Matrix(activations[^1].rows, activations[^1].columns);

			costMethods.Delta(zs[^1], activations[^1], desiredOutputs, delta);

			if(delta_gradient_b != null)
				delta_gradient_b[^1] = new Matrix(delta);

			Matrix transposed = new Matrix(activations[^2]);
			transposed.Transpose();

			Matrix? temp;
			Matrix.Multiply(delta, transposed, out temp);
			if (temp == null)
				throw new ArgumentException("Cannot multiply matrices.");
			delta_gradient_w[^1] = temp;

			for (int i = 2; i < network.LayerCount; i++)
			{
				var t = network.LayerCount - i;
				transposed = new Matrix(Weights[t]);
				transposed.Transpose();

				Matrix.Multiply(transposed, delta, out temp);
				if (temp == null)
					throw new ArgumentException("Cannot multiply matrices.");

				// multiply the derivative function on "temp"
				Matrix z = zs[^i];
				for (int j = 0; j < temp.rows; j++)
					for (int k = 0; k < temp.columns; k++)
						temp.SetValue(j, k, temp.GetValue(j, k) * activationObject.Derivative(z.GetValue(j, 0)));

				delta.Copy(temp);

				if (delta_gradient_b != null)
					delta_gradient_b[^i] = temp;

				t = network.LayerCount - i - 1;
				transposed = new Matrix(activations[t]);
				transposed.Transpose();
				Matrix.Multiply(delta, transposed, out temp);

				if (temp == null)
					throw new ArgumentException("Cannot multiply matrices.");
				delta_gradient_w[^i] = temp;
			}
			if (network.ClipThreshold > 0) // when greater than zero, clip the gradient norm to guard against exploding gradients
			{
				double gradients_norm, scale_factor;

				if (delta_gradient_b != null) // biases
				{
					gradients_norm = 0;
					for (int i = 0; i < delta_gradient_b.Length; i++)
						for (int j = 0; j < delta_gradient_b[i].rows; j++)
							for (int k = 0; k < delta_gradient_b[i].columns; k++)
								gradients_norm += delta_gradient_b[i].GetValue(j, k) * delta_gradient_b[i].GetValue(j, k);
					gradients_norm = Math.Sqrt(gradients_norm);
					if (gradients_norm > network.ClipThreshold)
					{
						scale_factor = network.ClipThreshold / gradients_norm;
						for (int i = 0; i < delta_gradient_b.Length; i++)
							for (int j = 0; j < delta_gradient_b[i].rows; j++)
								for (int k = 0; k < delta_gradient_b[i].columns; k++)
									delta_gradient_b[i].SetValue(j, k, delta_gradient_b[i].GetValue(j, k) * scale_factor);
					}
				}

				// weights
				gradients_norm = 0;
				for (int i = 0; i < delta_gradient_w.Length; i++)
					for (int j = 0; j < delta_gradient_w[i].rows; j++)
						for (int k = 0; k < delta_gradient_w[i].columns; k++)
							gradients_norm += delta_gradient_w[i].GetValue(j, k) * delta_gradient_w[i].GetValue(j, k);
				gradients_norm = Math.Sqrt(gradients_norm);
				if (gradients_norm > network.ClipThreshold)
				{
					scale_factor = network.ClipThreshold / gradients_norm;
					for (int i = 0; i < delta_gradient_w.Length; i++)
						for (int j = 0; j < delta_gradient_w[i].rows; j++)
							for (int k = 0; k < delta_gradient_w[i].columns; k++)
								delta_gradient_w[i].SetValue(j, k, delta_gradient_w[i].GetValue(j, k) * scale_factor);
				}
			}
		}

		private Matrix FeedForward(Matrix givenInputs, IActivationMethods activationObject)
		{
			if (ActivationName != activationObject.ActivationName)
				throw new ArgumentException($"activationObject argument incorrect: {activationObject.ActivationName}");

			for (int i = 0; i < LayerCount - 1; i++)
			{
				Matrix? temp;
				Matrix.Multiply(Weights[i], givenInputs, out temp);
				if (temp == null)
					throw new ArgumentException("Cannot multiply matrices.");

				if (Biases != null) // add bias
				{
					for (int j = 0; j < temp.rows; j++)
						for (int k = 0; k < temp.columns; k++)
							temp.SetValue(j, k, temp.GetValue(j, k) + Biases[i].GetValue(j, 0));
				}

				if (i < LayerCount - 2)
					activationObject.ActivationMethod(temp);
				else
					activationObject.OutputActivationMethod(temp);
				givenInputs = temp;
			}
			return givenInputs;
		}

		private void Randomize(Randomization initialization)
		{
			var rand = new Random();
			for (int a = 0; a < Weights.Count; a++)
			{
				double init;
				switch (initialization)
				{
					case Randomization.HeNormal: // He-style scaling (applied here with a uniform distribution); good for large ReLU networks
						init = Math.Sqrt(2.0 / Weights[a].columns); // columns == fan-in
						break;
					case Randomization.GlorotXavier: // for general use
						init = Math.Sqrt(6.0 / (Weights[a].columns + Weights[a].rows));
						break;
					default:
						init = 1;
						break;
				}
				for (int i = 0; i < Weights[a].rows; i++)
					for (int j = 0; j < Weights[a].columns; j++)
						Weights[a].SetValue(i, j, rand.NextDouble() * init - init * 0.5);
			}
			if (Biases != null)
			{
				for (int a = 0; a < Biases.Count; a++)
				{
					double init;
					switch (initialization)
					{
						case Randomization.HeNormal: // good for large ReLU networks
							init = Math.Sqrt(2.0 / Biases[a].columns);
							break;
						case Randomization.GlorotXavier: // for general use
							init = Math.Sqrt(6.0 / (Biases[a].columns + Biases[a].rows));
							break;
						default:
							init = 1;
							break;
					}
					for (int i = 0; i < Biases[a].rows; i++)
						Biases[a].SetValue(i, 0, rand.NextDouble() * init - init * 0.5);
				}
			}
		}

		public string ToJson()
		{
			return System.Text.Json.JsonSerializer.Serialize(this);
		}

		public static NeuralNetwork? FromJson(string json)
		{
			return System.Text.Json.JsonSerializer.Deserialize<NeuralNetwork>(json);
		}
		#endregion

	}
	public enum Randomization // play with learning rate when switching between these randomizations
	{
		HeNormal, // good for ReLU, Leaky ReLU and ELU when used on large neural networks
		GlorotXavier // good for smaller networks or those using tanh or Sigmoid activation; results in faster learning
	}
	internal class BatchParams
	{
		public NeuralNetwork network;
		public ICostMethods costMethods;
		public List<Matrix> givenInputs, desiredOutputs, local_weights;
		public List<Matrix>? local_biases;
		public IActivationMethods activationObject;
		public Matrix[] delta_gradient_w;
		public Matrix[]? delta_gradient_b;
		public double learningRate;
		public double lambda;
		public Thread? thread;
		public int threadCount;
	}

	public class Matrix
	{
		public double[] data { get; set; }
		public int rows { get; set; }
		public int columns { get; set; }
		public Matrix()
		{
			data = new double[0];
		}
		public Matrix(int rows, int columns)
		{
			this.rows = rows;
			this.columns = columns;
			data = new double[rows * columns];
		}
		public Matrix(int rows, int columns, double[] values, bool transpose = false)
		{
			this.rows = rows;
			this.columns = columns;
			data = new double[rows * columns];
			if (transpose)
			{
				for (var c = 0; c < columns; c++)
					for (var r = 0; r < rows; r++)
						SetValue(c, r, values[r * columns + c]);
			}
			else
				Array.Copy(values, data, rows * columns);
		}
		public Matrix(Matrix m)
		{
			rows = m.rows;
			columns = m.columns;
			data = new double[rows * columns];
			Array.Copy(m.data, data, rows * columns);
		}
		public void Transpose()
		{
			var result = new Matrix(columns, rows);
			for (var c = 0; c < columns; c++)
				for (var r = 0; r < rows; r++)
					result.SetValue(c, r, GetValue(r, c));
			Copy(result);
		}
		public static void Add(Matrix M, double V)
		{
			for (int i = 0; i < M.rows; i++)
				for (int j = 0; j < M.columns; j++)
					M.SetValue(i, j, M.GetValue(i, j) + V);
		}
		public static void Multiply(Matrix A, Matrix B, out Matrix? C) // standard matrix multiplication: each entry of C is the dot product of a row of A with a column of B
		{
			C = null;
			if (A.columns == B.rows)
			{
				int m = A.rows, p = B.columns, n = A.columns;
				C = new Matrix(m, p);
				for (int i = 0; i < C.rows; i++)
					for (int j = 0; j < C.columns; j++)
					{
						for (int k = 0; k < n; k++)
						{
							double d = A.GetValue(i, k) * B.GetValue(k, j);
							double v = C.GetValue(i, j);
							C.SetValue(i, j, v + d);
						}
					}
			}
		}
		public void Dropout(double dropoutRate) // apply a dropout to the matrix
		{
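			// note: Dropout is provided as a utility; nothing in this listing calls it during training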
			var rand = new Random();
			for (int i = 0; i < rows; i++)
				for (int j = 0; j < columns; j++)
					if (rand.NextDouble() < dropoutRate)
						SetValue(i, j, 0.0);
		}
		public void SetValuesAndTranspose(int rows, int columns, double[] values)
		{
			this.rows = rows;
			this.columns = columns;
			data = new double[rows * columns];
			for (var c = 0; c < columns; c++)
				for (var r = 0; r < rows; r++)
					SetValue(c, r, values[r * columns + c]);
		}
		public double GetValue(int row, int column)
		{
			return data[row * columns + column];
		}
		public void SetValue(int row, int column, double value)
		{
			data[row * columns + column] = value;
		}
		public void SetValues(int rows, int columns, double[] values)
		{
			if (rows * columns != this.rows * this.columns)
				data = new double[rows * columns];
			this.rows = rows;
			this.columns = columns;
			Array.Copy(values, data, rows * columns);
		}
		public void Copy(Matrix m)
		{
			if (rows * columns != m.rows * m.columns)
				data = new double[m.rows * m.columns];
			rows = m.rows;
			columns = m.columns;
			Array.Copy(m.data, data, rows * columns);
		}
		public void Clear()
		{
			rows = columns = 0;
			data = new double[0];
		}
		public void Print()
		{
			for (int i = 0; i < rows; i++)
			{
				for (int j = 0; j < columns; j++)
				{
					Console.Write(GetValue(i, j) + " ");
				}
				Console.WriteLine();
			}
			Console.WriteLine();
		}
	}
	public interface IActivationMethods
	{
		public void ActivationMethod(Matrix outputs);
		public void OutputActivationMethod(Matrix outputs);
		public double Derivative(double input);
		public void OutputDerivative(Matrix z, Matrix derivatives);
		public string ActivationName { get; }
	}
	public class ActivationReLUSoftmax : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Rectified Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.ReLU(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			Functions.Softmax(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.ReLUPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives) // this is the Softmax derivative and this may not work with the Quadratic Cost function
		{
			double d;
			for (int i = 0; i < z.rows; i++)
			{
				d = z.GetValue(i, 0);
				derivatives.SetValue(i, 0, d * (1.0 - d));
			}
		}
		public string ActivationName { get { return "ReLU/Softmax"; } }
	}
	public class ActivationELUSoftmax : IActivationMethods
	{
		public readonly double alpha;
		public ActivationELUSoftmax(double alpha = 1.0)
		{
			this.alpha = alpha;
		}
		public void ActivationMethod(Matrix outputs) // Exponential Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.ELU(outputs.GetValue(i, j), alpha));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			Functions.Softmax(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.ELUPrime(input, alpha);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives) // this is the Softmax derivative and this may not work with the Quadratic Cost function
		{
			double d;
			for (int i = 0; i < z.rows; i++)
			{
				d = z.GetValue(i, 0);
				derivatives.SetValue(i, 0, d * (1.0 - d));
			}
		}
		public string ActivationName { get { return $"ELU({alpha})/Softmax"; } }
	}
	public class ActivationLeakyReLUSoftmax : IActivationMethods
	{
		public readonly double alpha;
		public ActivationLeakyReLUSoftmax(double alpha = 0.25)
		{
			this.alpha = alpha;
		}
		public void ActivationMethod(Matrix outputs) // Leaky Rectified Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.LeakyReLU(outputs.GetValue(i, j), alpha));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			Functions.Softmax(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.LeakyReLUPrime(input, alpha);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives) // this is the Softmax derivative and this may not work with the Quadratic Cost function
		{
			double d;
			for (int i = 0; i < z.rows; i++)
			{
				d = z.GetValue(i, 0);
				derivatives.SetValue(i, 0, d * (1.0 - d));
			}
		}
		public string ActivationName { get { return $"LeakyReLU({alpha})/Softmax"; } }
	}
	public class ActivationReLUSigmoid : IActivationMethods // useful for networks with binary output (single neuron output)
	{
		public void ActivationMethod(Matrix outputs) // Rectified Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.ReLU(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs) // Used for binary outputs, 0 to 1
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public double Derivative(double input)
		{
			return Functions.ReLUPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.SigmoidPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return "ReLU/Sigmoid"; } }
	}
	public class ActivationELUSigmoid : IActivationMethods // useful for networks with binary output (single neuron output)
	{
		public readonly double alpha;
		public ActivationELUSigmoid(double alpha = 1.0)
		{
			this.alpha = alpha;
		}
		public void ActivationMethod(Matrix outputs) // Exponential Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.ELU(outputs.GetValue(i, j), alpha));
		}
		public void OutputActivationMethod(Matrix outputs) // Used for binary outputs, 0 to 1
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public double Derivative(double input)
		{
			return Functions.ELUPrime(input, alpha);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.SigmoidPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return $"ELU({alpha})/Sigmoid"; } }
	}
	public class ActivationLeakyReLUSigmoid : IActivationMethods // useful for networks with binary output (single neuron output)
	{
		public readonly double alpha;
		public ActivationLeakyReLUSigmoid(double alpha = 0.25)
		{
			this.alpha = alpha;
		}
		public void ActivationMethod(Matrix outputs) // Leaky Rectified Linear Unit function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.LeakyReLU(outputs.GetValue(i, j), alpha));
		}
		public void OutputActivationMethod(Matrix outputs) // Used for binary outputs, 0 to 1
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public double Derivative(double input)
		{
			return Functions.LeakyReLUPrime(input, alpha);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.SigmoidPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return $"LeakyReLU({alpha})/Sigmoid"; } }
	}
	public class ActivationTanhSigmoid : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Tanh function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Tanh(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public double Derivative(double input)
		{
			return Functions.TanhPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.SigmoidPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return "Tanh/Sigmoid"; } }
	}
	public class ActivationTanhSoftmax : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Tanh function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Tanh(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			Functions.Softmax(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.TanhPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives) // this is the Softmax derivative and this may not work with the Quadratic Cost function
		{
			double d;
			for (int i = 0; i < z.rows; i++)
			{
				d = z.GetValue(i, 0);
				derivatives.SetValue(i, 0, d * (1.0 - d));
			}
		}
		public string ActivationName { get { return "Tanh/Softmax"; } }
	}
	public class ActivationTanh : IActivationMethods // Good for regression problems where the goal is to predict continuous values
	{
		public void ActivationMethod(Matrix outputs) // Tanh function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Tanh(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			ActivationMethod(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.TanhPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.TanhPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return "Tanh"; } }
	}
	public class ActivationSigmoidSoftmax : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Sigmoid function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			Functions.Softmax(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.SigmoidPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives) // this is the Softmax derivative and this may not work with the Quadratic Cost function
		{
			double d;
			for (int i = 0; i < z.rows; i++)
			{
				d = z.GetValue(i, 0);
				derivatives.SetValue(i, 0, d * (1.0 - d));
			}
		}
		public string ActivationName { get { return "Sigmoid/Softmax"; } }
	}
	public class ActivationSigmoid : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Sigmoid function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Sigmoid(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			ActivationMethod(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.SigmoidPrime(input);
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.SigmoidPrime(z.GetValue(i, 0)));
		}
		public string ActivationName { get { return "Sigmoid"; } }
	}
	public class ActivationLinear : IActivationMethods
	{
		public void ActivationMethod(Matrix outputs) // Linear (identity) function applied to whole matrix
		{
			for (int i = 0; i < outputs.rows; i++)
				for (int j = 0; j < outputs.columns; j++)
					outputs.SetValue(i, j, Functions.Linear(outputs.GetValue(i, j)));
		}
		public void OutputActivationMethod(Matrix outputs)
		{
			ActivationMethod(outputs);
		}
		public double Derivative(double input)
		{
			return Functions.LinearPrime();
		}
		public void OutputDerivative(Matrix z, Matrix derivatives)
		{
			for (int i = 0; i < z.rows; i++)
				derivatives.SetValue(i, 0, Functions.LinearPrime());
		}
		public string ActivationName { get { return "Linear"; } }
	}
	public interface ICostMethods
	{
		public void Delta(Matrix z, Matrix outputs, Matrix desiredOutputs, Matrix returnValue);
		public double LossFunction(Matrix feedForwardOutputs, Matrix desiredOutputs);
	}
	public class CostQuadratic : ICostMethods // this cost function learns VERY slowly; cross entropy cost functions perform much better
	{
		private readonly IActivationMethods activationObject;
		public CostQuadratic(IActivationMethods activationObject)
		{
			this.activationObject = activationObject;
		}
		public void Delta(Matrix z, Matrix outputs, Matrix desiredOutputs, Matrix returnValue)
		{
			var derivatives = new Matrix(z.rows, z.columns);
			activationObject.OutputDerivative(z, derivatives);
			for (int i = 0; i < returnValue.rows; i++)
				returnValue.SetValue(i, 0, (outputs.GetValue(i, 0) - desiredOutputs.GetValue(i, 0)) * derivatives.GetValue(i, 0));
		}
		public double LossFunction(Matrix feedForwardOutputs, Matrix desiredOutputs)
		{
			return Functions.Quadratic(feedForwardOutputs, desiredOutputs);
		}
	}
	public class CostCategoricalCrossEntropy : ICostMethods
	{
		public void Delta(Matrix z, Matrix outputs, Matrix desiredOutputs, Matrix returnValue)
		{
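			// with a softmax output layer, the categorical cross entropy delta simplifies to (activation - desiredOutput)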
			for (int i = 0; i < returnValue.rows; i++)
				returnValue.SetValue(i, 0, outputs.GetValue(i, 0) - desiredOutputs.GetValue(i, 0));
		}
		public double LossFunction(Matrix feedForwardOutputs, Matrix desiredOutputs)
		{
			return Functions.CategoricalCrossEntropy(feedForwardOutputs, desiredOutputs);
		}
	}
	public class CostBinaryCrossEntropy : ICostMethods
	{
		public void Delta(Matrix z, Matrix outputs, Matrix desiredOutputs, Matrix returnValue)
		{
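			// with a sigmoid output layer, the binary cross entropy delta simplifies to (activation - desiredOutput)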
			for (int i = 0; i < returnValue.rows; i++)
				returnValue.SetValue(i, 0, outputs.GetValue(i, 0) - desiredOutputs.GetValue(i, 0));
		}
		public double LossFunction(Matrix feedForwardOutputs, Matrix desiredOutputs)
		{
			return Functions.BinaryCrossEntropy(feedForwardOutputs, desiredOutputs);
		}
	}
	public static class Functions
	{
		public static double Linear(double x) // Linear function
		{
			return x;
		}
		public static double LinearPrime() // derivative of Linear function (the line's slope)
		{
			return 1;
		}
		// alpha default might be 0.01, but this can be modified, bigger or smaller; tensorflow uses 0.2 while keras uses 0.3
		public static double LeakyReLU(double x, double alpha) // Rectified Linear Unit function (Leaky variant)
		{
			return x >= 0 ? x : (alpha * x);
		}
		public static double LeakyReLUPrime(double x, double alpha) // derivative of Leaky ReLU function
		{
			return x >= 0 ? 1 : alpha;
		}
		public static double ReLU(double x) // Rectified Linear Unit function
		{
			return x > 0 ? x : 0;
		}
		public static double ReLUPrime(double x) // derivative of ReLU function
		{
			return x > 0 ? 1 : 0;
		}
		public static double ELU(double x, double alpha) // Exponential Linear Unit function
		{
			return x >= 0 ? x : (alpha * (Math.Exp(x) - 1));
		}
		public static double ELUPrime(double x, double alpha) // derivative of ELU function
		{
			return x >= 0 ? 1 : (alpha * Math.Exp(x));
		}
		public static double Tanh(double x)
		{
			return Math.Tanh(x); // (e^x - e^-x) / (e^x + e^-x), computed without overflow for large |x|
		}
		public static double TanhPrime(double x) // derivative of Tanh function
		{
			double t = Math.Tanh(x);
			return 1 - t * t; // this is simply: 1 - (tanh(x) * tanh(x))
		}
		public static double Sigmoid(double x)
		{
			return 1.0 / (1 + Math.Exp(-x));
		}
		public static double SigmoidPrime(double x) // derivative of Sigmoid function
		{
			double s = Sigmoid(x);
			return s * (1.0 - s); // this is simply: Sigmoid(x) * (1.0 - Sigmoid(x))
		}
		public static void Softmax(Matrix input)
		{
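			// subtract the maximum before exponentiating; softmax is shift-invariant and this prevents overflow for large inputs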
			double max = double.MinValue;

			for (int i = 0; i < input.rows; i++)
				max = Math.Max(max, input.GetValue(i, 0));

			double val, sum = 0;
			for (int i = 0; i < input.rows; i++)
			{
				val = Math.Exp(input.GetValue(i, 0) - max);
				input.SetValue(i, 0, val);
				sum += val;
			}

			for (int i = 0; i < input.rows; i++)
				input.SetValue(i, 0, input.GetValue(i, 0) / sum);
		}
		public static double BinaryCrossEntropy(Matrix feedForwardOutputs, Matrix desiredOutput) // Binary Cross Entropy; both parameters should be 1x1 matrices (a single output)
		{
			const double epsilon = 1e-15;
			double y_true = desiredOutput.GetValue(0, 0);
			double y_pred = Math.Min(1 - epsilon, Math.Max(epsilon, feedForwardOutputs.GetValue(0, 0)));
			return -((y_true * Math.Log(y_pred)) + ((1 - y_true) * Math.Log(1 - y_pred)));
		}
		public static double Quadratic(Matrix feedForwardOutputs, Matrix desiredOutput)
		{
			double sum = 0.0;
			for (int i = 0; i < feedForwardOutputs.rows; i++)
				sum += Math.Pow(feedForwardOutputs.GetValue(i, 0) - desiredOutput.GetValue(i, 0), 2);
			return 0.5 * sum;
		}
		private static int indexMax(Matrix m) // only pass column-vector (n x 1) matrices
		{
			int index = -1;
			double a, max = double.MinValue;
			for (int i = 0; i < m.rows; i++)
			{
				a = m.GetValue(i, 0);
				if (a > max)
				{
					max = a;
					index = i;
				}
			}
			return index;
		}

		public static double CategoricalCrossEntropy(Matrix feedForwardOutputs, Matrix desiredOutput) // Categorical Cross Entropy; desiredOutput should be a one-hot vector (note: returns a flat loss of 1 whenever the predicted class does not match the label)
		{
			int iO = indexMax(feedForwardOutputs), iL = indexMax(desiredOutput);
			double a = (iO == iL) ? feedForwardOutputs.GetValue(iO, 0) : 0;
			if (a == 0)
				return 1;
			if (a == 1)
				return 0;
			return -Math.Log(a);
		}
	}
}
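
Because the network's weights, biases and settings are public properties, a trained model can be saved and restored with the built-in System.Text.Json serialization via ToJson and FromJson; a minimal sketch (the file name is just an example, and nn is a trained NeuralNetwork instance):

	// save a trained network
	System.IO.File.WriteAllText("mnist-network.json", nn.ToJson());

	// later, restore it
	ML.NeuralNetwork? restored = ML.NeuralNetwork.FromJson(System.IO.File.ReadAllText("mnist-network.json"));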
