Commit 4031c7b6 authored by James Bergstra

cuda.blas - added inplace option to GpuGemm (and optimizations)

Parent: a73ce6b6
@@ -153,14 +153,24 @@ class GpuGemm(Op):
Need to check al least refcount. Need to check al least refcount.
""" """
def __init__(self, inplace):
    """Build a gemm op that either overwrites or preserves its z input.

    :param inplace: when True the op writes its result into input 0 (z),
        which is declared to the optimizer via ``destroy_map``.
    """
    self.inplace = inplace
    if self.inplace:
        # Output 0 destroys input 0 -- required so Theano never reuses z.
        self.destroy_map = {0: [0]}
def __str__(self):
    # Name the variant explicitly so debug printouts of a graph show
    # whether the destructive or the safe op was selected.
    return 'GpuGemm{inplace}' if self.inplace else 'GpuGemm{no_inplace}'
def __eq__(self, other):
    # Two ops are interchangeable only if they are the same class AND
    # agree on in-placeness (the optimizer must not merge the variants).
    if type(self) != type(other):
        return False
    return self.inplace == other.inplace
def __hash__(self):
    # Consistent with __eq__: both the concrete type and the inplace
    # flag contribute to the hash.
    type_hash = hash(type(self))
    return type_hash ^ hash(self.inplace)
def __setstate__(self, dct):
    """Restore pickled state.

    Pickles written before the ``inplace`` flag existed carry no such
    key; those ops behaved destructively, so default to ``True``.

    Bug fixed: unpickling bypasses ``__init__``, so ``destroy_map`` was
    never re-declared for inplace instances -- the optimizer would then
    treat a destructive op as pure.  Re-declare it here, and keep any
    other pickled attributes instead of dropping them.
    """
    self.__dict__.update(dct)
    self.inplace = dct.get('inplace', True)
    if self.inplace:
        self.destroy_map = {0: [0]}
def make_node(self, z, a, x, y, b): def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemm is assumed to already # the more complicated error checking performed by tensor.gemm is assumed to already
...@@ -168,13 +178,16 @@ class GpuGemm(Op): ...@@ -168,13 +178,16 @@ class GpuGemm(Op):
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
    # Version 2: bumped when the inplace/no-inplace code paths were
    # introduced, so stale compiled modules are not reused.
    version = (2,)
    return version
def c_code(self, node, name, inputs, outputs, sub):
    """Return C code computing ``z_out = a * dot(x, y) + b * z_in`` on GPU.

    ``inputs`` is ``(z_in, a, x, y, b)`` and ``outputs`` is ``(z_out,)``.
    When ``self.inplace`` the output takes a reference to ``z_in`` and the
    gemm overwrites it; otherwise ``z_in`` is first copied into ``z_out``
    (reallocating if the shape does not match) so the caller's z survives.

    Bug fixed: the non-inplace branch emitted ``if (!(z_out)s)`` -- the
    missing ``%`` escaped the ``% locals()`` substitution and left the
    literal text ``(z_out)s`` in the generated C, which cannot compile.
    The string is assembled with a list + join (works on Python 2 and 3,
    unlike ``print >> sio``); the C semantics are unchanged.
    """
    z_in, a, x, y, b = inputs
    z_out, = outputs
    fail = sub['fail']
    chunks = []
    # Read the scalar coefficients a and b from their 0-d host arrays,
    # accepting either float32 or float64 storage.
    chunks.append("""
    #define REAL float
    float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(a)s->data)[0])
            : (REAL)(((double*)%(a)s->data)[0]);
    float %(name)s_b = (%(b)s->descr->type_num == PyArray_FLOAT)
            ? (REAL)(((float*)%(b)s->data)[0])
            : (REAL)(((double*)%(b)s->data)[0]);
    #undef REAL
    """)
    if self.inplace:
        # Inplace: z_out is simply a new reference to z_in.
        chunks.append("""
    Py_XDECREF(%(z_out)s);
    %(z_out)s = %(z_in)s;
    Py_INCREF(%(z_out)s);
    """)
    else:
        # Not inplace: reuse z_out if it already has the right 2-d shape
        # (copy z_in into it), otherwise allocate a fresh copy of z_in.
        chunks.append("""
    if (!%(z_out)s
        || (%(z_out)s->nd != 2)
        || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
        || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1]))
    {
        Py_XDECREF(%(z_out)s);
        %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
        if (!%(z_out)s)
        {
            %(fail)s;
        }
    }
    else
    {
        if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
        {
            %(fail)s;
        }
    }
    """)
    # Either way z_out now holds z_in's data; accumulate the gemm into it.
    chunks.append("""
    if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
    {
        %(fail)s;
    }
    """)
    return '\n'.join(chunks) % locals()
# The two public gemm singletons.  Graph-building code should use the
# non-destructive variant; an optimizer may later swap in the inplace one.
gpu_gemm_inplace = GpuGemm(inplace=True)
gpu_gemm_no_inplace = GpuGemm(inplace=False)
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
......
...@@ -7,7 +7,8 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to ...@@ -7,7 +7,8 @@ from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, to
from theano.sandbox.cuda.basic_ops import * from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace,
gpu_gemm_no_inplace, GpuConv)
from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
...@@ -191,18 +192,21 @@ def local_gpu_gemm(node): ...@@ -191,18 +192,21 @@ def local_gpu_gemm(node):
def local_gpu_gemm(node):
    """
    gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
    """
    # Map each host gemm variant to the GPU op with the same in-placeness.
    gemms = {tensor.blas.gemm_inplace: gpu_gemm_inplace,
             tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
    if node.op == gpu_from_host:
        # gpu_from_host(gemm(...)) -> run the gemm directly on the GPU.
        producer = node.inputs[0].owner
        if producer and producer.op in gemms:
            gpu_op = gemms[producer.op]
            z, a, x, y, b = producer.inputs
            return [gpu_op(gpu_from_host(z), a,
                           gpu_from_host(x), gpu_from_host(y), b)]
    if node.op in gemms:
        # gemm with any GPU-resident tensor input -> move the whole gemm
        # to the GPU and transfer the result back.
        z, a, x, y, b = node.inputs
        any_on_gpu = any(v.owner and v.owner.op == host_from_gpu
                         for v in (x, y, z))
        if any_on_gpu:
            gpu_op = gemms[node.op]
            return [host_from_gpu(gpu_op(gpu_from_host(z), a,
                                         gpu_from_host(x),
                                         gpu_from_host(y), b))]
    return False
@register_opt() @register_opt()
......
Markdown format supported
0%
You are adding 0 people to this discussion. Please proceed with caution.
Please finish editing this comment first!
Register or sign in to post a comment