PROWAREtech
C++: Example CPU Benchmark Program Code
How to create a CPU benchmark program using C++ and a little assembly.
This code was compiled with Visual Studio 2022. The Windows build targets 32-bit x86 only, and the Linux build targets x64 (x86-64) only. The Windows code consists of two files: one C++ source file and one assembly source file.
It's very simple to create a CPU benchmark program. It does require a little assembly to control how the machine code is created.
For the Windows and Linux executables, download CPUBENCH.zip.
First the C++ code (attention was given to make this compile on Windows and Linux):
// ONLY SUPPORT WIN32 AND LINUX64
#ifndef _WIN64
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>
#ifdef WIN32
extern "C" void stress_cpu(int operations);
extern "C" int cpu_brand(char brand[]);
#else // else go ahead and use the Linux compiler and hope that it does not try to optimize this code
/*
 * Linux (GCC/Clang, x86-64) implementation of the CPU stress kernel.
 *
 * Runs `operations` iterations of a fully unrolled body. Each iteration
 * executes M6 twice; M6 expands to 5^5 = 3125 copies of M1, and M1 is one
 * add/xor pair on EAX, so one iteration is 12500 dependent ALU instructions.
 * A zero or negative `operations` executes no iterations.
 *
 * Every asm statement names EAX (and the flags) as clobbered. Without that
 * the compiler is free to keep a live value such as the loop counter in EAX,
 * and the hand-written instructions would silently corrupt it in optimized
 * builds. The emitted add/xor instructions themselves are unchanged.
 */
void stress_cpu(int operations)
{
// One add/xor pair on EAX; "volatile" keeps the statement from being removed or merged.
#define M1 asm volatile("addl $1, %%eax\n\txorl $1, %%eax" : : : "eax", "cc");
#define M2 M1 M1 M1 M1 M1
#define M3 M2 M2 M2 M2 M2
#define M4 M3 M3 M3 M3 M3
#define M5 M4 M4 M4 M4 M4
#define M6 M5 M5 M5 M5 M5
    // Seed EAX once; the value itself is irrelevant, only the instruction count matters.
    asm volatile("movl $1, %%eax" : : : "eax");
    for (int i = 0; i < operations; i++)
    {
        M6
        M6
    }
}
/*
 * Executes CPUID leaf 0x80000002 and returns EAX in the low 32 bits and EBX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and overwrote RBX, RCX and
 * RDX behind the compiler's back; RBX is callee-saved in the x86-64 System V
 * ABI.
 */
long long cpu_brand0()
{
    unsigned int eax = 0x80000002u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)ecx; // written by CPUID but not part of this function's result
    (void)edx;
    return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}
/*
 * Executes CPUID leaf 0x80000002 and returns ECX in the low 32 bits and EDX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and let CPUID overwrite RBX,
 * which is callee-saved in the x86-64 System V ABI, without telling the
 * compiler.
 */
long long cpu_brand1()
{
    unsigned int eax = 0x80000002u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)eax; // written by CPUID but not part of this function's result
    (void)ebx;
    return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}
/*
 * Executes CPUID leaf 0x80000003 and returns EAX in the low 32 bits and EBX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and overwrote RBX, RCX and
 * RDX behind the compiler's back; RBX is callee-saved in the x86-64 System V
 * ABI.
 */
long long cpu_brand2()
{
    unsigned int eax = 0x80000003u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)ecx; // written by CPUID but not part of this function's result
    (void)edx;
    return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}
/*
 * Executes CPUID leaf 0x80000003 and returns ECX in the low 32 bits and EDX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and let CPUID overwrite RBX,
 * which is callee-saved in the x86-64 System V ABI, without telling the
 * compiler.
 */
long long cpu_brand3()
{
    unsigned int eax = 0x80000003u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)eax; // written by CPUID but not part of this function's result
    (void)ebx;
    return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}
/*
 * Executes CPUID leaf 0x80000004 and returns EAX in the low 32 bits and EBX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and overwrote RBX, RCX and
 * RDX behind the compiler's back; RBX is callee-saved in the x86-64 System V
 * ABI.
 */
long long cpu_brand4()
{
    unsigned int eax = 0x80000004u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)ecx; // written by CPUID but not part of this function's result
    (void)edx;
    return static_cast<long long>((static_cast<unsigned long long>(ebx) << 32) | eax);
}
/*
 * Executes CPUID leaf 0x80000004 and returns ECX in the low 32 bits and EDX
 * in the high 32 bits of the result.
 *
 * The registers are received through extended-asm output operands and the
 * value is returned explicitly. The previous form had no return statement
 * (undefined behaviour for a non-void function) and let CPUID overwrite RBX,
 * which is callee-saved in the x86-64 System V ABI, without telling the
 * compiler.
 */
long long cpu_brand5()
{
    unsigned int eax = 0x80000004u;
    unsigned int ebx = 0;
    unsigned int ecx = 0;
    unsigned int edx = 0;
    asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx));
    (void)eax; // written by CPUID but not part of this function's result
    (void)ebx;
    return static_cast<long long>((static_cast<unsigned long long>(edx) << 32) | ecx);
}
#endif
// Platform shims: give the Windows build the pthread-style names used by the
// rest of this file, and pull in the native headers on Linux.
#ifdef WIN32
#include <windows.h>
// A Win32 thread id is a DWORD; CreateThread writes it through THREAD_ID_PTR.
#define pthread_t DWORD
// Maps onto CreateThread. ATTR is ignored. Unlike POSIX pthread_create (0 on success),
// this expression evaluates to the new thread HANDLE, which is NULL on failure.
#define pthread_create(THREAD_ID_PTR, ATTR, ROUTINE, PARAMS) CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)ROUTINE,(void*)(PARAMS),0,THREAD_ID_PTR)
// Win32 Sleep() takes its argument in milliseconds.
// NOTE(review): on Linux the name sleep resolves to the <unistd.h> function, whose
// argument is in seconds, so sleep(n) does not wait the same time on both platforms.
#define sleep(ms) Sleep(ms)
#else // Linux
#include <pthread.h>
#include <unistd.h>
#endif
// Number of threads used for the multi-core run: one per hardware thread reported
// by the standard library (which may report 0 when the value cannot be determined).
const int MAX_THREADS = std::thread::hardware_concurrency();
// Iteration count passed to stress_cpu() for every run.
const int OPERATIONS = 10000000;
// Count of worker threads that have been started and have not finished yet.
// It is decremented by every worker and read by the main thread, so it must be
// atomic: with a plain int the concurrent decrements are a data race and a lost
// update would leave the counter above zero forever.
std::atomic<int> threads_active{0};
// Worker entry point handed to pthread_create(): runs one full stress pass,
// then removes itself from the active-worker count. The argument is unused and
// the result is always null.
// NOTE(review): the Windows pthread_create macro casts this function to
// LPTHREAD_START_ROUTINE; confirm the calling convention matches on Win32.
void* thread(void*)
{
    stress_cpu(OPERATIONS);
    --threads_active; // signal completion to the thread waiting on the counter
    return nullptr;
}
// Current time in milliseconds from a monotonic clock, so that a wall-clock
// adjustment during a run cannot distort (or make negative) the elapsed time.
static unsigned long long now_ms()
{
    return static_cast<unsigned long long>(
        std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now().time_since_epoch()).count());
}

// Score = 10,000,000 / elapsed milliseconds * thread count. An elapsed time of
// zero is treated as one millisecond so the division can never trap.
static unsigned int score(unsigned long long start_ms, unsigned long long end_ms, unsigned int thread_count)
{
    const unsigned long long elapsed_ms = end_ms > start_ms ? end_ms - start_ms : 1ULL;
    return static_cast<unsigned int>(10000000ULL / elapsed_ms * thread_count);
}

// Kept for compatibility: evaluates the score from the local start_ms/end_ms pair.
#define SCORE(THREAD_COUNT) score(start_ms, end_ms, static_cast<unsigned int>(THREAD_COUNT))

// Starts one worker running thread(). Returns false when the operating system
// refuses to create it. The thread is never joined, so its handle is released
// (Windows) or it is detached (Linux) right away; it keeps running either way.
static bool start_worker()
{
    pthread_t tid;
#ifdef WIN32
    HANDLE handle = pthread_create(&tid, NULL, thread, NULL); // expands to CreateThread
    if (handle == NULL)
        return false;
    CloseHandle(handle);
    return true;
#else
    if (pthread_create(&tid, NULL, thread, NULL) != 0)
        return false;
    pthread_detach(tid);
    return true;
#endif
}

// Prints the CPU brand string, then runs the multi-core benchmark (only when
// more than one hardware thread is reported) followed by the single-core
// benchmark, printing a score for each. Waits for a key/Ctrl+C before exiting.
int main(int argc, char** argv)
{
    char brand[48 + 16];
    brand[48] = 0; // the brand string is 48 bytes and not guaranteed to be terminated
#ifdef WIN32
    if (cpu_brand(brand)) // retrieve the processor brand -- this requires assembly code
    {
        std::cout << "CPU: " << brand << std::endl;
    }
#else
    // retrieve the processor brand -- this requires assembly code
    const long long parts[6] = {
        cpu_brand0(), cpu_brand1(), cpu_brand2(),
        cpu_brand3(), cpu_brand4(), cpu_brand5()
    };
    // Unpack each 64-bit part least-significant byte first (the in-memory order
    // on x86) without reading through a union member that was not last written.
    for (int i = 0; i < 48; i++)
    {
        const unsigned long long part = static_cast<unsigned long long>(parts[i / 8]);
        brand[i] = static_cast<char>((part >> ((i % 8) * 8)) & 0xFFu);
    }
    std::cout << "CPU: " << brand << std::endl;
#endif
    unsigned long long start_ms, end_ms;
    if (MAX_THREADS > 1)
    {
        std::cout << "testing multi-core performance with " << MAX_THREADS << " threads" << std::endl;
        start_ms = now_ms(); // start milliseconds
        threads_active = 0;
        int threads_running = 1; // the main thread also runs the workload
        for (int i = 1; i < MAX_THREADS; i++)
        {
            threads_active++; // count the worker before it can possibly finish
            if (start_worker())
                threads_running++;
            else
                threads_active--; // never started: do not wait for it below
        }
        stress_cpu(OPERATIONS); // run the cpu stress on the main thread
        while (threads_active != 0)
            std::this_thread::sleep_for(std::chrono::milliseconds(1)); // one millisecond on every platform
        end_ms = now_ms(); // end milliseconds
        std::cout << "multi-core score: " << SCORE(threads_running) << std::endl;
    }
    std::cout << "testing single-core performance" << std::endl;
    start_ms = now_ms();
    stress_cpu(OPERATIONS);
    end_ms = now_ms();
    std::cout << "single-core score: " << SCORE(1) << std::endl << std::endl << "Ctrl+C to end program" << std::endl;
    std::cin.get();
    return 0;
}
#endif
Now, the assembly code (this was designed to compile with the Microsoft Macro Assembler and is not compatible with Linux):
TITLE 'extern "C" void stress_cpu(int operations);'
TITLE 'extern "C" int cpu_brand(char brand[]);'
; Unrolling macros for the stress loop. Each level repeats the previous one
; five times, so M6 expands to 5^5 = 3125 copies of M1 (6250 instructions).

; M1: the basic unit of work -- one add and one xor on eax.
M1 MACRO
add eax, 1
xor eax, 1
ENDM
; M2: 5 x M1
M2 MACRO
M1
M1
M1
M1
M1
ENDM
; M3: 5 x M2 = 25 x M1
M3 MACRO
M2
M2
M2
M2
M2
ENDM
; M4: 5 x M3 = 125 x M1
M4 MACRO
M3
M3
M3
M3
M3
ENDM
; M5: 5 x M4 = 625 x M1
M5 MACRO
M4
M4
M4
M4
M4
ENDM
; M6: 5 x M5 = 3125 x M1
M6 MACRO
M5
M5
M5
M5
M5
ENDM
.686P
.model FLAT
.safeseh _stress_cpu ; this is required for Visual Studio and the x86 linker (ML.EXE /safeseh)
PUBLIC _stress_cpu
PUBLIC _cpu_brand
_TEXT SEGMENT
; extern "C" void stress_cpu(int operations);
; Runs the unrolled add/xor body (two M6 expansions) `operations` times.
; A zero or negative count returns immediately, matching a C loop of the form
; for (i = 0; i < operations; i++). Previously only zero was rejected, so a
; negative count was decremented all the way around through zero.
; Uses eax and ecx only.
_stress_cpu PROC NEAR
	mov ecx, DWORD PTR [esp+4] ; load operations parameter from stack
	cmp ecx, 0
	jle exit ; signed test: nothing to do for zero or negative counts
	mov eax, 1
loop1:
	M6
	M6
	dec ecx
	cmp ecx, 0
	je exit
	jmp loop1
exit:
	ret
_stress_cpu ENDP
; Check that the cpuid instruction is supported by trying to toggle bit 21 of
; EFLAGS and seeing whether the change sticks.
; Returns eax = 1 when supported, eax = 0 otherwise. Preserves ebx.
; The caller's EFLAGS value is put back before returning; previously it was
; saved in ebx "for restoring" but never actually restored.
_cpu_id_supported PROC NEAR
	push ebx ; save ebx for the caller
	pushfd ; push eflags on the stack
	pop eax ; pop them into eax
	mov ebx, eax ; keep the original eflags for the comparison and the restore
	xor eax, 200000h ; toggle bit 21
	push eax ; push the toggled eflags
	popfd ; pop them back into eflags
	pushfd ; push eflags
	pop eax ; eax = eflags as the processor actually accepted them
	push ebx
	popfd ; restore the original eflags (must happen before cmp sets new flags)
	cmp eax, ebx ; unchanged means bit 21 could not be toggled
	jz not_supported
	mov eax, 1
	jmp exit
not_supported:
	xor eax, eax
exit:
	pop ebx
	ret
_cpu_id_supported ENDP
; extern "C" int cpu_brand(char brand[]);
; Copies the 48-byte cpu name into the first parameter using cpuid leaves
; 80000002h..80000004h (eax, ebx, ecx, edx of each leaf, in that order).
; Returns eax = 1 on success, eax = 0 when cpuid or those leaves are missing;
; the buffer is left untouched on failure. No terminating zero is written.
; Preserves ebx and esi (esi was previously overwritten without being saved).
_cpu_brand PROC NEAR
	push ebx
	push esi
	call _cpu_id_supported
	cmp eax, 0
	je exit ; eax is already 0 = failure
	mov eax, 80000000h
	cpuid ; eax = highest extended leaf
	cmp eax, 80000004h
	jb not_supported ; unsigned compare: a small value here means no extended leaves
	mov esi, [esp+(2+1)*4] ; brand parameter: skip saved esi, saved ebx and the return address
	mov eax, 80000002h
	cpuid
	mov [esi], eax
	mov [esi+4], ebx
	mov [esi+8], ecx
	mov [esi+12], edx
	mov eax, 80000003h
	cpuid
	mov [esi+16], eax
	mov [esi+20], ebx
	mov [esi+24], ecx
	mov [esi+28], edx
	mov eax, 80000004h
	cpuid
	mov [esi+32], eax
	mov [esi+36], ebx
	mov [esi+40], ecx
	mov [esi+44], edx
	mov eax, 1
	jmp exit
not_supported:
	xor eax, eax
exit:
	pop esi
	pop ebx
	ret
_cpu_brand ENDP
_TEXT ENDS
END
Some sample benchmark scores run on Windows and Linux machines:
CPU: Intel(R) Core(TM) i5-2520M CPU @ 2.50GHz
WINDOWS x86
testing multi-core performance with 4 threads
multi-core score: 812
testing single-core performance
single-core score: 252

CPU: Intel(R) Core(TM) i7-4770K CPU @ 3.50GHz
WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 2520
testing single-core performance
single-core score: 343

CPU: Intel(R) Core(TM) i7-3770 CPU @ 3.40GHz
WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 1928
testing single-core performance
single-core score: 297

CPU: Intel(R) Core(TM) i9-10900K CPU @ 3.70GHz
WINDOWS x86
testing multi-core performance with 20 threads
multi-core score: 7080
testing single-core performance
single-core score: 386

CPU: Intel(R) Core(TM) i7-2760QM CPU @ 2.40GHz
WINDOWS x86
testing multi-core performance with 8 threads
multi-core score: 1744
testing single-core performance
single-core score: 269

CPU: Intel(R) Core(TM) i3-3245 CPU @ 3.40GHz
LINUX x64
testing multi-core performance with 4 threads
multi-core score: 912
testing single-core performance
single-core score: 269
Comment