提交 f6f2e926 作者: Thomas George

Now uses `inplace` as an op param for GpuGemm, GpuGemmBatch and GpuGemv.

上级 94d9e1ca
...@@ -39,6 +39,7 @@ class GpuGemv(BlasOp): ...@@ -39,6 +39,7 @@ class GpuGemv(BlasOp):
Gemv on the GPU. Gemv on the GPU.
""" """
params_type = ParamsType(inplace=Scalar('bool'))
__props__ = ('inplace',) __props__ = ('inplace',)
def __init__(self, inplace=False): def __init__(self, inplace=False):
...@@ -69,9 +70,9 @@ class GpuGemv(BlasOp): ...@@ -69,9 +70,9 @@ class GpuGemv(BlasOp):
beta = beta.astype(expected) beta = beta.astype(expected)
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage, params):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
inplace = self.inplace inplace = params['inplace']
if inplace and y.strides[0] < 0: if inplace and y.strides[0] < 0:
inplace = False inplace = False
if A.shape[1] == 0: if A.shape[1] == 0:
...@@ -83,27 +84,24 @@ class GpuGemv(BlasOp): ...@@ -83,27 +84,24 @@ class GpuGemv(BlasOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3], vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
beta=inp[4], fail=sub['fail'], name=name) beta=inp[4], fail=sub['fail'], name=name,
if self.inplace: params=sub['params'])
code = """ code = """
if (%(y)s->ga.strides[0] <= 0) { if (!%(params)s->inplace || %(y)s->ga.strides[0] <= 0) {
%(out)s = theano_try_copy(%(out)s, %(y)s); %(out)s = theano_try_copy(%(out)s, %(y)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
} else { } else {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(y)s; %(out)s = %(y)s;
Py_INCREF(%(out)s); Py_INCREF(%(out)s);
} }
""" % vars %(out)s = theano_try_copy(%(out)s, %(y)s);
else: if (%(out)s == NULL) {
code = """ %(fail)s
%(out)s = theano_try_copy(%(out)s, %(y)s); }
if (%(out)s == NULL) { """ % vars
%(fail)s
}
""" % vars
# in case of possible speed up using blas dot, # in case of possible speed up using blas dot,
# temporary hack A to 1D for vector-vector dot # temporary hack A to 1D for vector-vector dot
code += """ code += """
...@@ -150,7 +148,7 @@ class GpuGemv(BlasOp): ...@@ -150,7 +148,7 @@ class GpuGemv(BlasOp):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (8,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
...@@ -161,6 +159,7 @@ class GpuGemm(BlasOp): ...@@ -161,6 +159,7 @@ class GpuGemm(BlasOp):
Gemm on the GPU. Gemm on the GPU.
""" """
params_type = ParamsType(inplace=Scalar('bool'))
__props__ = ('inplace',) __props__ = ('inplace',)
_f16_ok = True _f16_ok = True
...@@ -200,9 +199,9 @@ class GpuGemm(BlasOp): ...@@ -200,9 +199,9 @@ class GpuGemm(BlasOp):
assert C.ndim == 2 assert C.ndim == 2
return Apply(self, [C, alpha, A, B, beta], [C.type()]) return Apply(self, [C, alpha, A, B, beta], [C.type()])
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs, params):
C, alpha, A, B, beta = inputs C, alpha, A, B, beta = inputs
inplace = self.inplace inplace = params['inplace']
if inplace and not C.flags.forc: if inplace and not C.flags.forc:
inplace = False inplace = False
outputs[0][0] = blas.gemm(alpha, A, B, beta, C, outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
...@@ -210,35 +209,26 @@ class GpuGemm(BlasOp): ...@@ -210,35 +209,26 @@ class GpuGemm(BlasOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
beta=inp[4], fail=sub['fail'], name=name) beta=inp[4], fail=sub['fail'], name=name,
if self.inplace: params=sub['params'])
code = """ code = """
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) { if (!%(params)s->inplace || !GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = theano_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
%(fail)s %(fail)s
} }
} else { } else {
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = %(C)s; %(out)s = %(C)s;
Py_INCREF(%(out)s); Py_INCREF(%(out)s);
} }
""" % vars if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
else: ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
code = """ %(A)s, %(B)s,
%(out)s = theano_try_copy(%(out)s, %(C)s); ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
if (%(out)s == NULL) { %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars
code += """
if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(A)s, %(B)s,
((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
%(out)s, 0) == -1) {
%(fail)s
}
""" % vars """ % vars
if config.gpuarray.sync: if config.gpuarray.sync:
code += """ code += """
...@@ -247,7 +237,7 @@ class GpuGemm(BlasOp): ...@@ -247,7 +237,7 @@ class GpuGemm(BlasOp):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
...@@ -308,7 +298,7 @@ class GpuGer(BlasOp): ...@@ -308,7 +298,7 @@ class GpuGer(BlasOp):
} }
if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0], if (pygpu_blas_rger(((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
%(x)s, %(y)s, %(out)s, 0) == -1) { %(x)s, %(y)s, %(out)s, 0) == -1) {
%(fail)s %(fail)s
} }
""" % vars """ % vars
if config.gpuarray.sync: if config.gpuarray.sync:
...@@ -392,6 +382,7 @@ gpu_dot22 = GpuDot22() ...@@ -392,6 +382,7 @@ gpu_dot22 = GpuDot22()
class GpuGemmBatch(BlasOp): class GpuGemmBatch(BlasOp):
params_type = ParamsType(inplace=Scalar('bool'))
__props__ = ('inplace',) __props__ = ('inplace',)
_f16_ok = True _f16_ok = True
...@@ -428,11 +419,11 @@ class GpuGemmBatch(BlasOp): ...@@ -428,11 +419,11 @@ class GpuGemmBatch(BlasOp):
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3], vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
beta=inp[4], inplace=int(self.inplace), beta=inp[4], params=sub['params'],
fail=sub['fail'], name=name) fail=sub['fail'], name=name)
code = """ code = """
int err; int err;
if (%(inplace)s){ if (%(params)s->inplace){
if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) { if (!GpuArray_ISONESEGMENT(&%(C)s->ga)) {
%(out)s = theano_try_copy(%(out)s, %(C)s); %(out)s = theano_try_copy(%(out)s, %(C)s);
if (%(out)s == NULL) { if (%(out)s == NULL) {
...@@ -468,7 +459,7 @@ class GpuGemmBatch(BlasOp): ...@@ -468,7 +459,7 @@ class GpuGemmBatch(BlasOp):
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (2,) return (3,)
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False) gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True) gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论