1 files changed, 28 insertions, 20 deletions
diff --git a/gdr_test.cu b/gdr_test.cu
index cc624fc..d0118d5 100644
--- a/gdr_test.cu
+++ b/gdr_test.cu
@@ -93,11 +93,11 @@ int main(int argc, char *argv[]) {
         char gpu_name[30] = {0};
         initAssert (cuDeviceGetName (gpu_name, 30, current_gpu));
 
-        printf("GPU %i: %s\n", num_gpus, gpu_name);
+        printf(" GPU %i: %s\n", num_gpus, gpu_name);
 
 
         if (strncmp (gpu_name, "Tesla K40", 9) == 0) {
-            printf ("Found a Tesla GPU! I'll use that one.\n");
+            printf ("  Found a Tesla GPU! I'll use that one.\n");
             gpu = current_gpu;
             break;
         }
@@ -112,12 +112,12 @@ int main(int argc, char *argv[]) {
         //(see https://www.cs.cmu.edu/afs/cs/academic/class/15668-s11/www/cuda-doc/html/group__CUDART__DRIVER.html)
     unsigned int api_version;
     initAssert (cuCtxGetApiVersion (context, &api_version));
-    printf ("CUDA API Version: %u\n", api_version);
-    printf ("CUDA init done\n\n");
+    printf (" CUDA API Version: %u\n", api_version);
+    //printf ("CUDA init done\n\n");
 
     CUdevprop gpu_props;
     initAssert(cuDeviceGetProperties(&gpu_props, gpu));
-    printf ("Clock %lu KHz\n", gpu_props.clockRate);
+    printf (" GPU Clock %lu KHz\n", gpu_props.clockRate);
 
     CUdeviceptr d_A, d_D;
     initAssert(cuMemAlloc(&d_D, GPU_PAGE));	// Should be multiple of GPU page, or mapping of next allocation will segfault the gdrcopy module
@@ -149,7 +149,7 @@ int main(int argc, char *argv[]) {
     int A_bar_off = A_info.va - d_A;
     volatile uint32_t *A = (uint32_t *)((char *)A_bar_ptr + A_bar_off);
 
-    printf("DevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu, Page: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size, A_info.page_size);
+    printf("\nDevicePtr: %lx, GDR ptr: %p, Bus ptr: %lx, (Bar: %p, Offset: %i), VA: 0x%lx, Size: %lu\n", d_A, A, A_info.bus_addr, A_bar_ptr, A_bar_off, A_info.va, A_info.mapped_size);
 
     pcilib_t *pci;
     volatile void *bar;
@@ -166,7 +166,7 @@ int main(int argc, char *argv[]) {
         printf("map bar\n");
         exit(1);
     }
-    printf("BAR mapped to: %p\n", bar);
+    //printf("BAR mapped to: %p\n", bar);
 
     CUdeviceptr dBAR;
 //    initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP));
@@ -176,7 +176,7 @@ int main(int argc, char *argv[]) {
     //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));
 
     bar_info = pcilib_get_bar_info(pci, BAR);
-    printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);
+    printf("Bar: %p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);
 
     pcilib_kmem_handle_t *kdesc_kmem = pcilib_alloc_kernel_memory (pci, PCILIB_KMEM_TYPE_CONSISTENT, 1, 128, 4096, KMEM_USE_RING, KMEM_DEFAULT_FLAGS);
     uintptr_t kdesc_bus = pcilib_kmem_get_block_ba (pci, kdesc_kmem, 0);
@@ -196,6 +196,7 @@ int main(int argc, char *argv[]) {
     uintptr_t desc_bus = kdesc_bus;
 #endif
 
+    printf("\nSize: %lu bytes (%lu %lu-byte descriptors with packet length set to %lu), GPU itertions: %lu, Iterations: %lu\n", SIZE, NUM_PAGES, PAGE_SIZE, TLP_SIZE, GPU_ITERS, ITERS);
 
     memset ((uint32_t *)desc, 0, 5 * sizeof (uint32_t));
     volatile uint64_t *hwaddr = (uint64_t*)((char*)desc + 2 * sizeof(uint32_t));
@@ -205,16 +206,22 @@ int main(int argc, char *argv[]) {
     WR32 (REG_RESET_DMA, 0);
     usleep (100000);
 
-    WR32 (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
-    WR32 (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
-    WR32 (REG_UPDATE_THRESHOLD, 0);
-    WR64 (REG_UPDATE_ADDRESS, desc_bus);
-    WR32 (REG_DMA, 1);
-    WR32 (REG_INTERCONNECT, 0x232);
-    WR32 (REG_COUNTER, 1);
+    WR32_sleep (REG_NUM_PACKETS_PER_DESCRIPTOR, PAGE_SIZE / (4 * TLP_SIZE));
+    WR32_sleep (REG_PACKET_LENGTH, 0x80000 | TLP_SIZE);
+    WR32_sleep (REG_UPDATE_THRESHOLD, 1);
+    WR64_sleep (REG_UPDATE_COUNTER, desc_bus);
+    WR64_sleep (REG_UPDATE_ADDRESS, desc_bus + DESCRIPTOR_OFFSET);
+    WR32_sleep (REG_DMA, 1);
+    WR32_sleep (REG_INTERCONNECT, 0x232);
+    WR32_sleep (REG_COUNTER, 1);
+    
+    usleep(100000);
 
 #ifdef VERBOSE
-    struct timespec tss, tse, tsk;
+    struct timespec tss, tsk;
+# ifndef GPU_DESC
+    struct timespec tse;
+# endif
 #else
     struct timeval tvs, tve;
 #endif /* VERBOSE */
@@ -223,7 +230,7 @@ int main(int argc, char *argv[]) {
 	clock_gettime(CLOCK_REALTIME, &tss);
 
 #ifdef GPU_DESC
-	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint64_t*)d_D, (uint32_t*)d_A);
+	ipedma<<<1, 1>>>((void*)dBAR, A_info.bus_addr, (uint32_t*)d_D, (uint64_t*)(d_D + DESCRIPTOR_OFFSET), (uint32_t*)d_A);
 #else
 	WR64 (REG_DESCRIPTOR_ADDRESS, A_info.bus_addr);
 //    WR64 (REG_DESCRIPTOR_ADDRESS, kbuf_bus);
@@ -234,7 +241,8 @@ int main(int argc, char *argv[]) {
 
 	null<<<1, 1>>>((uint32_t*)d_A);
 #endif
-	cudaDeviceSynchronize();
+	err = cudaDeviceSynchronize();
+	if (err) printf("Oopps, synchronization error %i", err);
 
 	clock_gettime(CLOCK_REALTIME, &tsk);
 
@@ -246,7 +254,7 @@ int main(int argc, char *argv[]) {
 # ifdef GPU_DESC
 	double lat = 1000. * kbuf[0] / gpu_props.clockRate;
 	double latk = 1000. * kbuf[1] / gpu_props.clockRate;
-	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
+	double latc = ((tsk.tv_sec - tss.tv_sec)*1000000. + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
 # ifdef USE_HW_CONTER
 	double lath = 4. * RD32 (0x20) / 1000;
 # else
@@ -288,7 +296,7 @@ int main(int argc, char *argv[]) {
     pcilib_free_kernel_memory(pci, kdesc_kmem,  KMEM_DEFAULT_FLAGS);
 
     pcilib_close(pci);
-    printf("PCI closed\n");
+    printf("\nPCI closed\n");
 
 
     gdr_unmap(g, A_mh, A_bar_ptr, PAGE_SIZE);