articles » current » c-plus-plus » algorithms » cpu-benchmark-program

C++: Example CPU Benchmark Program Code

How to create a CPU benchmark program using C++ and a little assembly

This code was compiled with Visual Studio 2022. The Windows build targets 32-bit x86 and the Linux build targets 64-bit x64. The Windows version consists of two files: one for C++ and one for assembly.

It's very simple to create a CPU benchmark program. It does require a little assembly to control how the machine code is created.

For the Windows and Linux executables, download CPUBENCH.zip.

First, the C++ code (care was taken to make it compile on both Windows and Linux):


// ONLY SUPPORT WIN32 AND LINUX64
#ifndef _WIN64

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>


#ifdef WIN32

// Implemented in the companion MASM source file (32-bit Windows build only).
// Runs `operations` iterations of the add/xor stress loop.
extern "C" void stress_cpu(int operations);
// Fills brand[0..47] with the processor brand string obtained via CPUID.
// Returns nonzero on success and 0 when the information is unavailable; the
// assembly routine does not write a terminating NUL itself.
extern "C" int cpu_brand(char brand[]);

#else // else go ahead and use the Linux compiler and hope that it does not try to optimize this code

/**
 * Burns CPU time by executing a long, serially dependent chain of integer
 * add/xor instructions on EAX.
 *
 * Every loop iteration contains two M6 expansions, i.e. 2 * 5^5 = 6250
 * add/xor pairs, so the total work is proportional to `operations`.
 *
 * The instructions are emitted as extended asm with EAX and the flags listed
 * as clobbers. Without the clobber list the compiler is free to keep a live
 * value (for example the loop counter) in EAX, which the asm would silently
 * overwrite in an optimised build.
 *
 * @param operations number of loop iterations; zero or a negative value
 *                   performs no work.
 */
void stress_cpu(int operations)
{
#define M1 asm volatile("addl $1, %%eax\n\txorl $1, %%eax" : : : "eax", "cc");
#define M2 M1 M1 M1 M1 M1
#define M3 M2 M2 M2 M2 M2
#define M4 M3 M3 M3 M3 M3
#define M5 M4 M4 M4 M4 M4
#define M6 M5 M5 M5 M5 M5

	// Seed the accumulator; its value is irrelevant, only the dependency chain matters.
	asm volatile("movl $1, %%eax" : : : "eax");
	for (int i = 0; i < operations; i++)
	{
		M6
		M6
	}

	// The helper macros are only meaningful inside this function.
#undef M6
#undef M5
#undef M4
#undef M3
#undef M2
#undef M1
}

/**
 * Returns bytes 0-7 of the processor brand string: EAX (low half) and EBX
 * (high half) of CPUID leaf 0x80000002.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand0()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000002u)
		return 0;

	eax = 0x80000002u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}

/**
 * Returns bytes 8-15 of the processor brand string: ECX (low half) and EDX
 * (high half) of CPUID leaf 0x80000002.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand1()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000002u)
		return 0;

	eax = 0x80000002u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}

/**
 * Returns bytes 16-23 of the processor brand string: EAX (low half) and EBX
 * (high half) of CPUID leaf 0x80000003.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand2()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000003u)
		return 0;

	eax = 0x80000003u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}

/**
 * Returns bytes 24-31 of the processor brand string: ECX (low half) and EDX
 * (high half) of CPUID leaf 0x80000003.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand3()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000003u)
		return 0;

	eax = 0x80000003u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}

/**
 * Returns bytes 32-39 of the processor brand string: EAX (low half) and EBX
 * (high half) of CPUID leaf 0x80000004.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand4()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000004u)
		return 0;

	eax = 0x80000004u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}

/**
 * Returns bytes 40-47 of the processor brand string: ECX (low half) and EDX
 * (high half) of CPUID leaf 0x80000004.
 *
 * The registers are read through extended-asm output operands so the compiler
 * knows that CPUID overwrites EAX/EBX/ECX/EDX, and the value is returned
 * explicitly instead of being left in RAX by accident.
 *
 * @return the eight characters packed little-endian into a long long, or 0
 *         when the CPU does not report this extended leaf.
 */
long long cpu_brand5()
{
	unsigned int eax = 0x80000000u, ebx = 0, ecx = 0, edx = 0;

	// Leaf 0x80000000 yields the highest supported extended leaf in EAX.
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	if (eax < 0x80000004u)
		return 0;

	eax = 0x80000004u;
	asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
	return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}

#endif

// Platform shim: lets the rest of the file use pthread-style names on both
// operating systems.
#ifdef WIN32

#include <windows.h>
// The thread id written by CreateThread stands in for pthread_t.
#define pthread_t DWORD
// Maps pthread_create onto CreateThread; ATTR is ignored. NOTE: the expression
// yields CreateThread's HANDLE (NULL on failure), not pthread_create's int
// error code, and the handle is the caller's to close.
#define pthread_create(THREAD_ID_PTR, ATTR, ROUTINE, PARAMS) CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)ROUTINE,(void*)(PARAMS),0,THREAD_ID_PTR)
// Win32 Sleep takes milliseconds. NOTE(review): POSIX sleep() in the Linux
// branch takes seconds, so sleep(1) does not wait the same time on both
// platforms.
#define sleep(ms) Sleep(ms)

#else // Linux

#include <pthread.h>
#include <unistd.h>

#endif

// Number of threads used by the multi-core test. hardware_concurrency() may
// return 0 when the value cannot be determined, in which case the multi-core
// test is skipped by main().
const int MAX_THREADS = static_cast<int>(std::thread::hardware_concurrency());
// Loop iterations handed to stress_cpu() for every benchmark run.
const int OPERATIONS = 10000000;
// Number of worker threads that have not finished yet. It is incremented by
// main() and decremented by every worker, so it must be atomic: concurrent
// ++/-- on a plain int is a data race and can lose updates.
std::atomic<int> threads_active{0};

// Worker-thread entry point: performs one full stress_cpu() run and then
// removes itself from the count of active workers. The argument is unused and
// the result is always null.
void* thread(void* /*unused*/)
{
	stress_cpu(OPERATIONS);
	--threads_active;
	return nullptr;
}

// Benchmark score: 10,000,000 divided by the elapsed milliseconds, multiplied
// by the number of threads that ran. Expects `start_ms` and `end_ms` (unsigned
// long long) to be in scope at the point of use.
// - An elapsed time of zero (or a clock that appears to run backwards) is
//   treated as 1 ms instead of dividing by zero.
// - THREAD_COUNT is parenthesised so expressions such as SCORE(a + b) work.
// - The division intentionally happens before the multiplication so results
//   stay comparable with previously published scores.
#define SCORE(THREAD_COUNT) (unsigned int)(10000000ULL / ((end_ms) > (start_ms) ? (end_ms) - (start_ms) : 1ULL) * (THREAD_COUNT))

/**
 * Program entry point.
 *
 * 1. Prints the processor brand string.
 * 2. If more than one hardware thread is available, runs stress_cpu() on
 *    MAX_THREADS threads at once (MAX_THREADS - 1 workers plus this thread)
 *    and prints the multi-core score.
 * 3. Runs stress_cpu() once on this thread and prints the single-core score.
 * 4. Waits for a key press so a console window stays open.
 *
 * Timing uses the monotonic steady_clock so wall-clock adjustments cannot
 * distort (or make negative) the measured interval.
 *
 * @return always 0.
 */
int main(int argc, char** argv)
{
	(void)argc; // command-line arguments are not used
	(void)argv;

#ifdef WIN32
	char brand[48 + 16];
	brand[48] = 0; // cpu_brand() fills 48 bytes and does not terminate them
	if (cpu_brand(brand)) // retrieve the processor brand -- this requires assembly code
	{
		std::cout << "CPU: " << brand << std::endl;
	}
#else
	// retrieve the processor brand -- this requires assembly code
	union
	{
		char brand[48 + 16];
		long long x[6]; // six 8-byte pieces of the 48-byte brand string
	};
	brand[48] = 0;
	x[0] = cpu_brand0();
	x[1] = cpu_brand1();
	x[2] = cpu_brand2();
	x[3] = cpu_brand3();
	x[4] = cpu_brand4();
	x[5] = cpu_brand5();
	std::cout << "CPU: " << brand << std::endl;
#endif

	// Current reading of the monotonic clock in whole milliseconds.
	const auto now_ms = []() -> unsigned long long
	{
		return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count();
	};

	unsigned long long start_ms, end_ms; // these exact names are read by the SCORE macro

	if (MAX_THREADS > 1)
	{
		std::cout << "testing multi-core performance with " << MAX_THREADS << " threads" << std::endl;
		start_ms = now_ms(); // start milliseconds
		threads_active = 0;
		int threads_running = 1; // this thread always takes part
		for (int i = 1; i < MAX_THREADS; i++)
		{
			threads_active++; // count the worker before it starts so it can never decrement first
			pthread_t tid;
#ifdef WIN32
			// pthread_create is a macro around CreateThread here: it yields a HANDLE, NULL on failure
			HANDLE handle = pthread_create(&tid, NULL, thread, NULL);
			const bool started = handle != NULL;
			if (started)
				CloseHandle(handle); // the thread keeps running; only our reference is released
#else
			const bool started = pthread_create(&tid, NULL, thread, NULL) == 0;
			if (started)
				pthread_detach(tid); // never joined, so let the system reclaim it when it ends
#endif
			if (started)
				threads_running++;
			else
				threads_active--; // nothing will ever decrement for a thread that never started
		}
		stress_cpu(OPERATIONS); // run the cpu stress on the main thread
		while (threads_active)
			std::this_thread::sleep_for(std::chrono::milliseconds(1)); // one millisecond on every platform
		end_ms = now_ms(); // end milliseconds
		if (threads_running < MAX_THREADS)
			std::cerr << "warning: only " << threads_running << " of " << MAX_THREADS << " threads could be started" << std::endl;
		std::cout << "multi-core score: " << SCORE(threads_running) << std::endl;
	}

	// Announce the test before taking the start time so console I/O is not measured.
	std::cout << "testing single-core performance" << std::endl;
	start_ms = now_ms();
	stress_cpu(OPERATIONS);
	end_ms = now_ms();
	std::cout << "single-core score: " << SCORE(1) << std::endl << std::endl << "Ctrl+C to end program" << std::endl;

	std::cin.get();

	return 0;
}

#endif

Now, the assembly code (this was designed to compile with the Microsoft Macro Assembler and is not compatible with Linux):


TITLE 'extern "C" void stress_cpu(int operations);'
TITLE 'extern "C" int cpu_brand(char brand[]);'

; Stress-instruction macros. M1 is one add/xor pair on EAX; every following
; macro expands the previous one five times, so the number of add/xor pairs is
;   M1 = 1, M2 = 5, M3 = 25, M4 = 125, M5 = 625, M6 = 3125.
; Each instruction depends on the result of the one before it (same register).
M1 MACRO
	add eax, 1
	xor eax, 1
ENDM

; 5 x M1
M2 MACRO
	M1
	M1
	M1
	M1
	M1
ENDM

; 5 x M2
M3 MACRO
	M2
	M2
	M2
	M2
	M2
ENDM

; 5 x M3
M4 MACRO
	M3
	M3
	M3
	M3
	M3
ENDM

; 5 x M4
M5 MACRO
	M4
	M4
	M4
	M4
	M4
ENDM

; 5 x M5 -- the unit used by the _stress_cpu loop body
M6 MACRO
	M5
	M5
	M5
	M5
	M5
ENDM

; Assembler setup for the 32-bit build. cpuid, pushfd and popfd used below need
; a 386-or-later instruction set to be enabled.
.686P
; Flat 32-bit memory model.
.model FLAT
.safeseh _stress_cpu          ; this is required for Visual Studio and the x86 linker (ML.EXE /safeseh)

; Symbols exported to the C++ file; the leading underscore matches the
; extern "C" names stress_cpu and cpu_brand declared there.
PUBLIC	_stress_cpu
PUBLIC	_cpu_brand

_TEXT	SEGMENT

; void stress_cpu(int operations)
; Executes `operations` iterations of a loop whose body is two M6 expansions
; (2 * 3125 = 6250 add/xor pairs on EAX).
; In:  [esp+4] = operations (signed). Zero or negative performs no work, which
;      matches the C++ for-loop used on Linux; the previous equality-only test
;      let a negative count wrap around to roughly 2^32 iterations.
; Clobbers: eax, ecx, flags.
_stress_cpu PROC NEAR

	mov ecx, DWORD PTR [esp+4] ; load operations parameter from stack
	cmp ecx, 0
	jle exit                   ; nothing to do for operations <= 0

	mov eax, 1                 ; seed value; only the dependency chain matters

loop1:
	M6
	M6
	dec ecx                    ; dec sets ZF when the counter reaches zero
	jnz loop1

exit:		
	ret
_stress_cpu ENDP


; int cpu_id_supported(void)
; Checks that the cpuid instruction is supported by trying to toggle bit 21 of
; EFLAGS: if the bit keeps the toggled value, cpuid is available.
; Out: eax = 1 when supported, 0 otherwise.
; Preserves ebx and, unlike before, puts the caller's original EFLAGS back
; instead of leaving bit 21 toggled.
_cpu_id_supported PROC NEAR    ; check that the cpuid instruction is supported
	
	push ebx                   ; save ebx for the caller
	pushfd                     ; push eflags on the stack
	pop eax                    ; pop them into eax
	mov ebx, eax               ; keep the original eflags for comparing and restoring
	xor eax, 200000h           ; toggle bit 21
	push eax                   ; push the toggled eflags
	popfd                      ; pop them back into eflags
	pushfd                     ; push eflags
	pop eax                    ; eax = eflags as the CPU actually accepted them
	push ebx
	popfd                      ; restore the original eflags (done before cmp so its result survives)
	cmp eax, ebx               ; see if bit 21 was reset
	jz not_supported
	
	mov eax, 1
	jmp exit
	
not_supported:
	xor eax, eax

exit:
	pop ebx
	ret
_cpu_id_supported ENDP


; int cpu_brand(char brand[])
; Copies the 48-byte processor brand string (cpuid leaves 80000002h-80000004h,
; 16 bytes each in eax, ebx, ecx, edx order) into the caller's buffer. No
; terminating NUL is written.
; In:  first stack parameter = pointer to a buffer of at least 48 bytes.
; Out: eax = 1 on success, 0 when cpuid or the brand leaves are unavailable.
; Preserves ebx and esi; both are callee-saved, and esi was previously
; overwritten without being saved.
_cpu_brand PROC NEAR           ; copy the cpu name into the first parameter (using cpuid instruction)
	
	push ebx
	push esi

	call _cpu_id_supported
	cmp eax, 0
	je exit                    ; eax is already 0 = failure

	mov eax, 80000000h         ; ask for the highest supported extended leaf
	cpuid
	cmp eax, 80000004h
	jb not_supported           ; unsigned compare: a small value such as 00000016h must count as "below"

	mov esi, [esp+(2+1)*4]     ; brand pointer: skip two saved registers and the return address

	mov eax, 80000002h
	cpuid
	mov [esi], eax
	mov [esi+4], ebx
	mov [esi+8], ecx
	mov [esi+12], edx

	mov eax, 80000003h
	cpuid
	mov [esi+16], eax
	mov [esi+20], ebx
	mov [esi+24], ecx
	mov [esi+28], edx

	mov eax, 80000004h
	cpuid
	mov [esi+32], eax
	mov [esi+36], ebx
	mov [esi+40], ecx
	mov [esi+44], edx

	mov eax, 1
	jmp exit
	

not_supported:
	xor eax, eax
	
exit:
	pop esi
	pop ebx
	ret
_cpu_brand ENDP


_TEXT	ENDS
END

Some sample benchmark scores run on Windows and Linux machines:

CPU:        Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz WINDOWS x86
testing multi-core performance with 4 threads
multi-core score: 812
testing single-core performance
single-core score: 252

CPU: Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 2520
testing single-core performance
single-core score: 343

CPU:         Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 1928
testing single-core performance
single-core score: 297

CPU: Intel(R) Core(TM) i9-10900K CPU @ 3.70GHz WINDOWS x86
testing multi-core performance with 20 threads
multi-core score: 7080
testing single-core performance
single-core score: 386

CPU:       Intel(R) Core(TM) i7-2760QM CPU @ 2.40GHz WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 1744
testing single-core performance
single-core score: 269

CPU:         Intel(R) Core(TM) i3-3245 CPU @ 3.40GHz LINUX x64
testing multi-core performance with 4 threads
multi-core score: 912
testing single-core performance
single-core score: 269

This site uses cookies. Cookies are simple text files stored on the user's computer. They are used for adding features and security to this site. Read the privacy policy.
CLOSE