Square of Hilbert's Matrix
The aim of solving this problem is to learn how to use the XMM registers for
multiplication of floating point numbers. Matrix multiplication is a slow
calculation especially if the floating point unit is used, and hence doing
packed floating point calculations (if double precision is not required) might
just be much faster. So this program will test that.
The Hilbert's matrix is a simple matrix whose each element B(i, j) =
1/(i+j-1) . For more details refer the Wikipedia link .
The Hilbert's matrix is a square matrix, so first we will take the size of the matrix as input from the user and then allocate the memory block for the matrix and store the elements of the matrix. Then, we shall calculate the square of the matrix and print it on screen. The floating point precision level that will be used is single or 4 bytes per number.
%macro prologue 0
push rbp
mov rbp,rsp
push rbx
push r12
push r13
push r14
push r15
%endmacro
%macro epilogue 0
pop r15
pop r14
pop r13
pop r12
pop rbx
leave
ret
%endmacro
section .rodata
prompt1 db "Enter the size of the Hilbert matrix:",0
prompt2 db "The square of the matrix is:",10,0
int_prompt db "%d",0
float_prompt db "%f",10,0
section .text
global main, create_hilbert_matrix, print_float, square_float_matrix, print_float_matrix
extern printf, scanf, malloc, free
extern print_int,print_nl
main:
push rbp
mov rbp, rsp
sub rsp, 4
push rbx
push r12
push r13
push r14
push r15
; Enter the size of the matrix
xor rax, rax
mov rdi, dword prompt1
call printf
xor rax, rax
lea rsi, [rbp-4]
mov rdi, dword int_prompt
call scanf
; Create a Hilbert Matrix
xor rax, rax
xor rdi, rdi
mov edi, [rbp-4]
call create_hilbert_matrix
mov rbx, rax ; store a pointer to the Hilbert matrix
; Calculate the Square of a floating point matrix
; Arguments are the size of the matrix and the pointer to the input matrix
; Return value is the pointer to the square of the input matrix
mov rsi, rax
mov edi, [rbp-4]
call square_float_matrix
mov r12, rax ; store a pointer to the square of the Hilbert matrix
; Print the Hilbert Matrix and its square on the screen.
mov rsi, rbx
mov edi, [rbp-4]
call print_float_matrix
mov rsi, r12
mov edi, [rbp-4]
call print_float_matrix
; End the program and free memory
mov rdi, rbx
xor rax, rax
call free
mov rdi, r12
xor rax, rax
call free
epilogue
create_hilbert_matrix:
prologue
; size of the matrix is in RDI
mov r15, rdi
imul rdi, rdi
imul rdi, 4 ; sizeof(float)
; size of the memory to be allocated in RDI now
xor rax, rax
call malloc
; store a copy of the memory location pointing to the matrix.
mov rbx, rax
mov r12, rax
xor rcx, rcx
outer_loop_start:
cmp rcx, r15
jge outer_loop_end
xor rdx, rdx
inner_loop_start:
cmp rdx, r15
jge inner_loop_end
; each element is RCX+RDX+1
mov rdi, 1
add rdi, rcx
adc rdi, rdx
; convert the integer in RDI to a single precision floating point in XMM0
cvtsi2ss xmm0, edi
; calculate the reciprocal of the single precision floating point in XMM0 and place it in XMM1
rcpss xmm1, xmm0
; place the value in XMM1 in memory allocated for the matrix
movd dword[r12], xmm1
add r12, 4 ; sizeof(float)
inc rdx
jmp inner_loop_start
inner_loop_end:
inc rcx
jmp outer_loop_start
outer_loop_end:
mov rax, rbx
epilogue
square_float_matrix:
prologue
; the Hilbert matrix pointer is in RSI
mov r14, rsi
; size of the matrix is in RDI
mov r15, rdi
imul rdi, rdi
imul rdi, 4 ; sizeof(float)
; size of the memory to be allocated in RDI now
xor rax, rax
call malloc
; store a copy of the memory location pointing to the matrix.
mov rbx, rax
mov r12, rax
; start squaring the matrix
xor rcx, rcx
forloop_1:
cmp rcx, r15
jge end_forloop_1
xor rsi, rsi
forloop_2:
cmp rsi, r15
jge end_forloop_2
xor rdi, rdi
; B[i][j] = 0.0 in XMM0
cvtsi2ss xmm0, rdi
forloop_3:
cmp rdi, r15
jge end_forloop_3
; B[i][j]+= A[i][k]*A[k][j]
mov rax, r15
imul rax, rcx
add rax, rdi
mov r8, r15
imul r8, rdi
add r8, rsi
movd xmm1, dword[r14 + rax*4]
movd xmm2, dword[r14 + r8*4]
mulss xmm1, xmm2
addss xmm0, xmm1
inc rdi
jmp forloop_3
end_forloop_3:
; B[i][j] = XMM0
movd dword [r12], xmm0
add r12, 4 ; sizeof(float)
inc rsi
jmp forloop_2
end_forloop_2:
inc rcx
jmp forloop_1
end_forloop_1:
mov rax, rbx
epilogue
print_float_matrix:
prologue
; size of the matrix is in RDI and pointer is in RSI
mov r15, rdi
mov r14, rsi
xor r12, r12
floop_1:
cmp r12, r15
jge end_floop_1
xor r13, r13
floop_2:
cmp r13, r15
jge end_floop_2
; Loading value to be printed in XMM0
movd xmm0, dword[r14]
add r14,4
call print_float
inc r13
jmp floop_2
end_floop_2:
call print_nl
inc r12
jmp floop_1
end_floop_1:
xor rax, rax
epilogue
print_float:
prologue
; The float is in XMM0
mov rdi, dword float_prompt
mov rax,1
call printf
epilogue

