提交 c5f98527 作者: Frederic Bastien

implemented GpuGemm not inplace version

上级 6658cf32
...@@ -153,14 +153,29 @@ class GpuGemm(Op): ...@@ -153,14 +153,29 @@ class GpuGemm(Op):
Need to check at least refcount. Need to check at least refcount.
""" """
destroy_map = {0:[0]} def __init__(self, inplace):
self.__setstate__({'inplace':inplace})
def __str__(self): def __str__(self):
return 'GpuGemm' if self.inplace: inplace_str = 'inplace'
else: inplace_str = 'no_inplace'
return '%s{%s}' % (self.__class__.__name__, inplace_str)
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return (type(self) == type(other)\
and self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self)) ^ hash(self.inplace)
def __setstate__(self, dct):
inplace = dct.get('inplace', True)
if inplace:
self.destroy_map = {0: [0]}
self.inplace = inplace
def __getstate__(self):
return dict(inplace=self.inplace)
def make_node(self, z, a, x, y, b): def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemm is assumed to already # the more complicated error checking performed by tensor.gemm is assumed to already
...@@ -171,10 +186,14 @@ class GpuGemm(Op): ...@@ -171,10 +186,14 @@ class GpuGemm(Op):
return (2,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
#inplace version, we set z_out = z_in
#not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
z_out, = outputs z_out, = outputs
fail = sub['fail'] fail = sub['fail']
return """ if self.inplace:
return """
#define REAL float #define REAL float
float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT) float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
...@@ -194,7 +213,43 @@ class GpuGemm(Op): ...@@ -194,7 +213,43 @@ class GpuGemm(Op):
%(z_out)s = %(z_in)s; %(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
""" % locals() """ % locals()
gpu_gemm = GpuGemm() else:
return """
#define REAL float
float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
? (REAL)(((float*)%(a)s->data)[0])
: (REAL)(((double*)%(a)s->data)[0]);
float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT) ?
(REAL)(((float*)%(b)s->data)[0])
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
if ((NULL == %(z_out)s)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]))
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if(!%(z_out)s) {
PyErr_SetString(PyExc_MemoryError, "failed to alloc GpuGemm{no_inplace} output");
%(fail)s
}
}else{
if(CudaNdarray_CopyFromCudaNdarray(%(z_out)s,%(z_in)s)){
PyErr_SetString(PyExc_MemoryError, "failed to copy input in GpuGemm{no_inplace}");
%(fail)s
}
}
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
}
""" % locals()
gpu_gemm_inplace = GpuGemm(True)
gpu_gemm_no_inplace = GpuGemm(False)
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
......
...@@ -7,7 +7,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to ...@@ -7,7 +7,7 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv
from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -187,6 +187,8 @@ def local_gpu_dot22scalar(node): ...@@ -187,6 +187,8 @@ def local_gpu_dot22scalar(node):
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
""" """
Works for both inplace and non-inplace gemm.
gpu_from_host(gemm) -> gpu_gemm(gpu_from_host) gpu_from_host(gemm) -> gpu_gemm(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm) gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
...@@ -195,14 +197,24 @@ def local_gpu_gemm(node): ...@@ -195,14 +197,24 @@ def local_gpu_gemm(node):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.blas.gemm_inplace: if host_input.owner and host_input.owner.op == tensor.blas.gemm_inplace:
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)] return [gpu_gemm_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
if node.op == tensor.blas.gemm_inplace: elif host_input.owner and host_input.owner.op == tensor.blas.gemm_no_inplace:
z, a, x, y, b = host_input.owner.inputs
return [gpu_gemm_no_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
elif node.op == tensor.blas.gemm_inplace:
z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
elif node.op == tensor.blas.gemm_no_inplace:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))] return [host_from_gpu(gpu_gemm_no_inplace(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False return False
@register_opt() @register_opt()
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论