From: Olof Johansson

Implement the HCALLs that do more than one TCE setup or invalidation at a
time on pSeries LPAR.  The previous implementation did one hypervisor call
per setup or teardown, resulting in significant overhead.

A simple test of "time dd if=/dev/sda of=/dev/null bs=128k" shows the
amount of system time going down by about 5% when the multi-TCE calls
are used.

Signed-off-by: Olof Johansson
Signed-off-by: Andrew Morton
---

 25-akpm/arch/ppc64/kernel/pSeries_lpar.c |  126 ++++++++++++++++++++++++++++---
 25-akpm/include/asm-ppc64/hvcall.h       |    8 +
 2 files changed, 122 insertions(+), 12 deletions(-)

diff -puN arch/ppc64/kernel/pSeries_lpar.c~ppc64-make-use-of-batched-iommu-calls-on-pseries-lpars arch/ppc64/kernel/pSeries_lpar.c
--- 25/arch/ppc64/kernel/pSeries_lpar.c~ppc64-make-use-of-batched-iommu-calls-on-pseries-lpars	2004-09-01 21:58:02.592169856 -0700
+++ 25-akpm/arch/ppc64/kernel/pSeries_lpar.c	2004-09-01 21:58:02.599168792 -0700
@@ -112,6 +112,22 @@ long plpar_tce_put(unsigned long liobn,
 	return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval);
 }
 
+long plpar_tce_put_indirect(unsigned long liobn,
+			    unsigned long ioba,
+			    unsigned long page,
+			    unsigned long count)
+{
+	return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count);
+}
+
+long plpar_tce_stuff(unsigned long liobn,
+		     unsigned long ioba,
+		     unsigned long tceval,
+		     unsigned long count)
+{
+	return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count);
+}
+
 long plpar_get_term_char(unsigned long termno,
 			 unsigned long *len_ret,
 			 char *buf_ret)
@@ -161,6 +177,71 @@ static void tce_build_pSeriesLP(struct i
 	}
 }
 
+DEFINE_PER_CPU(void *, tce_page) = NULL;
+
+static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
+				     long npages, unsigned long uaddr,
+				     enum dma_data_direction direction)
+{
+	u64 rc;
+	union tce_entry tce, *tcep;
+	long l, limit;
+
+	if (npages == 1)
+		return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
+					   direction);
+
+	tcep = __get_cpu_var(tce_page);
+
+	/* This is safe to do since interrupts are off when we're called
+	 * from iommu_alloc{,_sg}()
+	 */
+	if (!tcep) {
+		tcep = (void *)__get_free_page(GFP_ATOMIC);
+		/* If allocation fails, fall back to the loop implementation */
+		if (!tcep)
+			return tce_build_pSeriesLP(tbl, tcenum, npages,
+						   uaddr, direction);
+		__get_cpu_var(tce_page) = tcep;
+	}
+
+	tce.te_word = 0;
+	tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT;
+	tce.te_rdwr = 1;
+	if (direction != DMA_TO_DEVICE)
+		tce.te_pciwr = 1;
+
+	/* We can map max one pageful of TCEs at a time */
+	do {
+		/*
+		 * Set up the page with TCE data, looping through and setting
+		 * the values.
+		 */
+		limit = min_t(long, npages, PAGE_SIZE/sizeof(union tce_entry));
+
+		for (l = 0; l < limit; l++) {
+			tcep[l] = tce;
+			tce.te_rpn++;
+		}
+
+		rc = plpar_tce_put_indirect((u64)tbl->it_index,
+					    (u64)tcenum << 12,
+					    (u64)virt_to_abs(tcep),
+					    limit);
+
+		npages -= limit;
+		tcenum += limit;
+	} while (npages > 0 && !rc);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce[0] val = 0x%lx\n", tcep[0].te_word);
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
 static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
 {
 	u64 rc;
@@ -169,23 +250,45 @@ static void tce_free_pSeriesLP(struct io
 	tce.te_word = 0;
 
 	while (npages--) {
-		rc = plpar_tce_put((u64)tbl->it_index, 
+		rc = plpar_tce_put((u64)tbl->it_index,
 				   (u64)tcenum << 12,
-				   tce.te_word );
-		
+				   tce.te_word);
+
 		if (rc && printk_ratelimit()) {
-			printk("tce_free_pSeriesLP: plpar_tce_put failed\n");
-			printk("\trc      = %ld\n", rc);
+			printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
 			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
 			printk("\ttce val = 0x%lx\n", tce.te_word );
 			show_stack(current, (unsigned long *)__get_SP());
 		}
-		
+
 		tcenum++;
 	}
 }
+
+static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
+{
+	u64 rc;
+	union tce_entry tce;
+
+	tce.te_word = 0;
+
+	rc = plpar_tce_stuff((u64)tbl->it_index,
+			     (u64)tcenum << 12,
+			     tce.te_word,
+			     npages);
+
+	if (rc && printk_ratelimit()) {
+		printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n");
+		printk("\trc      = %ld\n", rc);
+		printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
+		printk("\tnpages  = 0x%lx\n", (u64)npages);
+		printk("\ttce val = 0x%lx\n", tce.te_word );
+		show_stack(current, (unsigned long *)__get_SP());
+	}
+}
+
 int vtermno;	/* virtual terminal# for udbg */
 
 static void udbg_putcLP(unsigned char c)
@@ -315,8 +418,13 @@ void pSeriesLP_init_early(void)
 
 	tce_init_pSeries();
 
-	ppc_md.tce_build = tce_build_pSeriesLP;
-	ppc_md.tce_free	 = tce_free_pSeriesLP;
+	if (cur_cpu_spec->firmware_features & FW_FEATURE_MULTITCE) {
+		ppc_md.tce_build = tce_buildmulti_pSeriesLP;
+		ppc_md.tce_free	 = tce_freemulti_pSeriesLP;
+	} else {
+		ppc_md.tce_build = tce_build_pSeriesLP;
+		ppc_md.tce_free	 = tce_free_pSeriesLP;
+	}
 
 	pci_iommu_init();
 
@@ -461,7 +569,7 @@ static unsigned long pSeries_lpar_hpte_g
 	/* Do not need RPN to logical page translation */
 	/* No cross CEC PFT access */
 	flags = 0;
-	
+
 	lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1);
 
 	BUG_ON(lpar_rc != H_Success);
diff -puN include/asm-ppc64/hvcall.h~ppc64-make-use-of-batched-iommu-calls-on-pseries-lpars include/asm-ppc64/hvcall.h
--- 25/include/asm-ppc64/hvcall.h~ppc64-make-use-of-batched-iommu-calls-on-pseries-lpars	2004-09-01 21:58:02.593169704 -0700
+++ 25-akpm/include/asm-ppc64/hvcall.h	2004-09-01 21:58:02.599168792 -0700
@@ -101,10 +101,12 @@
 #define H_VIO_SIGNAL		0x104
 #define H_SEND_CRQ		0x108
 #define H_COPY_RDMA		0x110
-#define H_POLL_PENDING		0x1D8
+#define H_STUFF_TCE		0x138
+#define H_PUT_TCE_INDIRECT	0x13C
 #define H_VTERM_PARTNER_INFO	0x150
-#define H_REGISTER_VTERM	0x154
-#define H_FREE_VTERM		0x158
+#define H_REGISTER_VTERM	0x154
+#define H_FREE_VTERM		0x158
+#define H_POLL_PENDING		0x1D8
 
 /* plpar_hcall() -- Generic call interface using above opcodes
 * _
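
[Postscript for reviewers, not part of the patch: a minimal sketch of the
before/after hypervisor-call pattern on the unmap path, using the wrappers
added above.  The standalone function unmap_sketch() is a made-up name for
illustration; the real entry points are the ppc_md.tce_free hooks wired up
in pSeriesLP_init_early().]

	/* Illustrative sketch only -- not part of the patch.  Invalidating
	 * npages TCEs used to cost one hypervisor trap per entry via
	 * H_PUT_TCE; H_STUFF_TCE writes npages copies of a single TCE value
	 * in one trap.
	 */
	static void unmap_sketch(struct iommu_table *tbl, long tcenum,
				 long npages)
	{
		long i;

		/* old path: npages traps, one per entry */
		for (i = 0; i < npages; i++)
			plpar_tce_put((u64)tbl->it_index,
				      (u64)(tcenum + i) << 12,	/* IO bus address */
				      0);			/* invalid TCE */

		/* new path: one trap clears the whole range */
		plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12,
				0, npages);
	}

The map path batches the same way: H_PUT_TCE_INDIRECT takes the real
address of a page holding up to PAGE_SIZE/sizeof(union tce_entry) TCEs
(512 with 4K pages), so mapping a 128k buffer drops from 32 traps to one,
which is where the system-time saving in the dd test above comes from.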