; Zero 4096 locations starting at addr, one store per loop iteration.
; NOTE(review): pseudo-assembly (source-first operand order, %-prefixed
; registers, bracketed memory operand); the store is presumably one byte wide
; -- confirm the intended operand size.
mov 4096, %ecx ; %ecx = locations remaining (counts 4096 down to 1)
@@j0: ; loop target; must follow the counter load so %ecx is not reset each pass
mov 0, [addr+%ecx-1] ; -1 so offsets 4095..0 are cleared rather than 4096..1
loop @@j0 ; dec %ecx + jnz @@j0
; Copy 1024 double-words (4 bytes each) from srcaddr to dstaddr using a
; single repeated string move.
; NOTE(review): assumes the direction flag is clear so the copy walks upward -- confirm.
mov 1024, %ecx ; %ecx = repeat count consumed by rep
mov dstaddr, %edi ; %edi = destination pointer for movsd
mov srcaddr, %esi ; %esi = source pointer for movsd
rep movsd ; move one double-word per repetition until %ecx reaches 0
This is easily pipelined and handled internally by the CPU, and it also performs only a quarter as many operations (1024 iterations instead of 4096). Unfortunately, it requires a big sweep through two pages of data (the source and the destination); the CPU's cache algorithms should keep this up to date, but the copy will probably evict 8 KB of cache in the process.
Copyright © 2017, Eklektix, Inc.
Comments and public postings are copyrighted by their creators.
Linux is a registered trademark of Linus Torvalds