summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--config.h6
-rw-r--r--gdr_test.cu11
-rw-r--r--ipedma.h15
-rw-r--r--kernels.cu16
4 files changed, 39 insertions, 9 deletions
diff --git a/config.h b/config.h
index acefc3b..7d95924 100644
--- a/config.h
+++ b/config.h
@@ -1,9 +1,13 @@
#define ITERS 100
-#define GPU_ITERS 1000
+#define GPU_ITERS 100
+
#define TLP_SIZE 64
#define GPU_PAGE 65536
+
#define PAGE_SIZE 4096
#define VERBOSE
#define GPU_DESC
+
+#define USE_HW_CONTER
diff --git a/gdr_test.cu b/gdr_test.cu
index 13af482..cc624fc 100644
--- a/gdr_test.cu
+++ b/gdr_test.cu
@@ -172,6 +172,8 @@ int main(int argc, char *argv[]) {
// initAssert (cuMemHostRegister ((void*)((((uintptr_t)bar)/65536)*65536), 65536, CU_MEMHOSTREGISTER_DEVICEMAP));
initAssert (cuMemHostRegister ((void*)bar, 4096, CU_MEMHOSTREGISTER_IOMEMORY));
initAssert (cuMemHostGetDevicePointer(&dBAR, (void*)bar, 0));
+ // no effect
+ //initAssert (cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, dBAR));
bar_info = pcilib_get_bar_info(pci, BAR);
printf("%p (Phys: 0x%lx, Size: 0x%x)\n", bar_info[BAR].virt_addr, bar_info[BAR].phys_addr, bar_info[BAR].size);
@@ -208,6 +210,7 @@ int main(int argc, char *argv[]) {
WR32 (REG_UPDATE_THRESHOLD, 0);
WR64 (REG_UPDATE_ADDRESS, desc_bus);
WR32 (REG_DMA, 1);
+ WR32 (REG_INTERCONNECT, 0x232);
WR32 (REG_COUNTER, 1);
#ifdef VERBOSE
@@ -244,13 +247,19 @@ int main(int argc, char *argv[]) {
double lat = 1000. * kbuf[0] / gpu_props.clockRate;
double latk = 1000. * kbuf[1] / gpu_props.clockRate;
double latc = ((tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.) / GPU_ITERS;
+# ifdef USE_HW_CONTER
+ double lath = 4. * RD32 (0x20) / 1000;
+# else
+ double lath = 0;
+# endif
#else
double lat = (tse.tv_sec - tss.tv_sec)*1000000 + 1. * (tse.tv_nsec - tss.tv_nsec) / 1000.;
double latk = (tsk.tv_sec - tss.tv_sec)*1000000 + 1. * (tsk.tv_nsec - tss.tv_nsec) / 1000.;
double latc = 0;
+ double lath = 0;
#endif
- printf("Latency: %.3lf us / %.3lf us (%.3lf us) %x %x %x %x\n", lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
+ printf("hw: % 6.3lf us, sw: % 6.3lf us, +krn: % 6.3lf us, total: % 7.3lf us: %x %x %x %x\n", lath, lat, latk, latc, kbuf[0], kbuf[1], kbuf[2], kbuf[3]);
#else
if (!i) gettimeofday(&tvs, NULL);
#endif /* VERBOSE */
diff --git a/ipedma.h b/ipedma.h
index 284b058..299499c 100644
--- a/ipedma.h
+++ b/ipedma.h
@@ -1,20 +1,21 @@
#define REG_RESET_DMA 0x00
#define REG_DMA 0x04
#define REG_NUM_PACKETS_PER_DESCRIPTOR 0x10
-#define REG_PERF_COUNTER 0x28
+#define REG_PERF_COUNTER 0x20
+//#define REG_PERF_COUNTER 0x28
#define REG_PACKET_LENGTH 0x0C
#define REG_DESCRIPTOR_ADDRESS 0x50
#define REG_UPDATE_ADDRESS 0x58
#define REG_UPDATE_THRESHOLD 0x60
-
+#define REG_INTERCONNECT 0x9048
#define REG_COUNTER 0x9000
-#define WR32(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value);
-#define RD32(addr) (*(uint32_t *) (((char*)(bar)) + (addr)))
-#define WR32_sleep(addr, value) *(uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100);
+#define WR32(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value);
+#define RD32(addr) (*(volatile uint32_t *) (((char*)(bar)) + (addr)))
+#define WR32_sleep(addr, value) *(volatile uint32_t *) (((char*)(bar)) + (addr)) = (value); usleep (100);
-#define WR64(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value);
-#define RD64(addr) (*(uint64_t *) (((char*)(bar)) + (addr)))
+#define WR64(addr, value) *(volatile uint64_t *) (((char*)(bar)) + (addr)) = (value);
+#define RD64(addr) (*(volatile uint64_t *) (((char*)(bar)) + (addr)))
#define WR64_sleep(addr, value) *(uint64_t *) (((char*)(bar)) + (addr)) = (value); usleep (100);
diff --git a/kernels.cu b/kernels.cu
index 341bb59..3b5e1d7 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -17,8 +17,20 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t
desc[1] = 0;
+ // Reset counter
+#ifdef USE_HW_CONTER
+ WR32 (REG_DMA, 0);
+ WR32 (REG_PERF_COUNTER, 0);
+
+ WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+
+ t1 = clock64();
+ WR32 (REG_DMA, 1);
+#else
t1 = clock64();
WR64 (REG_DESCRIPTOR_ADDRESS, bus_addr);
+#endif
+
do {
if (++wait > 0x10000) break;
@@ -26,6 +38,10 @@ __global__ void ipedma(volatile void *bar, uintptr_t bus_addr, volatile uint64_t
t2 = clock64();
+#ifdef USE_HW_CONTER
+ WR32 (REG_PERF_COUNTER, 1);
+#endif
+
null<<<1,1>>>(data);
cudaDeviceSynchronize();
t3 = clock64();