Commit f2743791 authored by James Bergstra

Merge pull request #375 from nouiz/c_blas

additions and improvements to blas on both CPU and GPU (including GEMV, GER in blas_c)
...@@ -251,6 +251,188 @@ class GpuGemm(Op): ...@@ -251,6 +251,188 @@ class GpuGemm(Op):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(Op):
    """
    implement gemv on the gpu.

    Computes z_out = alpha * dot(x, y) + beta * z_in on CudaNdarrays by
    emitting C code that calls CudaNdarray_sgemv; the Op inputs are
    (z, a, x, y, b) where x is the matrix and y, z are vectors,
    mirroring tensor.blas.gemv.
    """
    def __init__(self, inplace):
        # Delegate to __setstate__ so construction and unpickling share
        # the same destroy_map setup.
        self.__setstate__({'inplace':inplace})

    def __str__(self):
        if self.inplace:
            return 'GpuGemv{inplace}'
        else:
            return 'GpuGemv{no_inplace}'

    def __eq__(self, other):
        # Equal iff same class and same inplace flag.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(type(self)) ^ hash(self.inplace)

    def __setstate__(self, dct):
        # Old pickles may lack the flag; default to inplace=True.
        inplace = dct.get('inplace', True)
        if inplace:
            # Output 0 reuses (destroys) input 0, i.e. z.
            self.destroy_map = {0: [0]}
        self.inplace = inplace

    def __getstate__(self):
        return dict(inplace=self.inplace)

    def make_node(self, z, a, x, y, b):
        # the more complicated error checking performed by tensor.gemv is
        # assumed to already have been done
        return Apply(self, [z, a, x, y, b], [z.type()])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x,y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y, b = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # Read the scalar coefficients from the host-side buffers.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input (refcount bookkeeping only).
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Reuse a previously-allocated output of the right shape when
            # possible, otherwise allocate a fresh copy of z_in.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 1)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        print >> sio, """
        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()


# Singleton instances used by the optimizations below.
gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(Op):
    """
    implement ger on the gpu.

    Computes the rank-1 update z_out = alpha * outer(x, y) + z_in on
    CudaNdarrays by emitting C code that calls CudaNdarray_sger; the Op
    inputs are (z, a, x, y) where z is a matrix and x, y are vectors.
    """
    def __init__(self, inplace):
        # Delegate to __setstate__ so construction and unpickling share
        # the same destroy_map setup.
        self.__setstate__({'inplace':inplace})

    def __str__(self):
        if self.inplace:
            return 'GpuGer{inplace}'
        else:
            return 'GpuGer{no_inplace}'

    def __eq__(self, other):
        # Equal iff same class and same inplace flag.
        return (type(self) == type(other)\
                and self.inplace == other.inplace)

    def __hash__(self):
        # Must stay consistent with __eq__.
        return hash(type(self)) ^ hash(self.inplace)

    def __setstate__(self, dct):
        # Old pickles may lack the flag; default to inplace=True.
        inplace = dct.get('inplace', True)
        if inplace:
            # Output 0 reuses (destroys) input 0, i.e. z.
            self.destroy_map = {0: [0]}
        self.inplace = inplace

    def __getstate__(self):
        return dict(inplace=self.inplace)

    def make_node(self, z, a, x, y):
        # the more complicated error checking performed by tensor.ger is
        # assumed to already have been done
        return Apply(self, [z, a, x, y], [z.type()])

    def c_code_cache_version(self):
        return (1,)

    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * outer(x,y) + z_in  (no beta term for ger)
        # inplace version: set z_out = z_in
        # not inplace version: we copy z_in to z_out.
        z_in, a, x, y = inputs
        z_out, = outputs
        fail = sub['fail']
        sio = StringIO.StringIO()
        # Read the scalar coefficient from the host-side buffer.
        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        """
        if self.inplace:
            # Alias the output to the input (refcount bookkeeping only).
            print >> sio, """
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
            """
        else:
            # Reuse a previously-allocated output of the right shape when
            # possible, otherwise allocate a fresh copy of z_in.
            print >> sio, """
            if (!%(z_out)s
                || (%(z_out)s->nd != 2)
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                )
            {
                Py_XDECREF(%(z_out)s);
                %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
                if (!%(z_out)s)
                {
                    %(fail)s;
                }
            }
            else
            {
                if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
                {
                    %(fail)s;
                }
            }
            """
        print >> sio, """
        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()


# Singleton instances used by the optimizations below.
gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(Op): class GpuOuter(Op):
def make_node(self, x, y): def make_node(self, x, y):
# we suppose type checking has been done, but make sure. # we suppose type checking has been done, but make sure.
......
...@@ -3012,6 +3012,92 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, ...@@ -3012,6 +3012,92 @@ int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B,
return 0; return 0;
} }
int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
{
    /**
     * C <- alpha A B + beta C
     * A : matrix
     * B, C: vector
     * alpha, beta: scalars
     *
     * Returns 0 on success, -1 (with a Python exception set) on failure.
     */
    if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
    if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
    if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }

    // We must allow dimensions to be zeros.
    if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
        || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
    {
        PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
                     CudaNdarray_HOST_DIMS(A)[0],
                     CudaNdarray_HOST_DIMS(A)[1],
                     CudaNdarray_HOST_DIMS(B)[0],
                     CudaNdarray_HOST_DIMS(C)[0]);
        return -1;
    }

    // If the matrix has non-unit size and non-unit stride in both directions,
    // neither axis is contiguous and cublas cannot consume it directly.
    // TODO: make a copy instead of returning in error
    if (((CudaNdarray_HOST_DIMS(A)[0] > 1) && (CudaNdarray_HOST_STRIDES(A)[0] != 1))
        && ((CudaNdarray_HOST_DIMS(A)[1] > 1) && (CudaNdarray_HOST_STRIDES(A)[1] != 1)))
    {
        PyErr_SetString(PyExc_NotImplementedError, "non-unit stride in gemv arg");
        return -1;
    }

    // I don't know if cudablas handles negative strides
    if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
        || (CudaNdarray_HOST_STRIDES(A)[1] < 0)
        || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
        || (CudaNdarray_HOST_STRIDES(C)[0] < 0))
    {
        PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)x(%i)->(%i)",
                     CudaNdarray_HOST_STRIDES(A)[0],
                     CudaNdarray_HOST_STRIDES(A)[1],
                     CudaNdarray_HOST_STRIDES(B)[0],
                     CudaNdarray_HOST_STRIDES(C)[0]);
        return -1;
    }

    /* create appropriate strides for malformed matrices that are row or column
     * vectors.
     * NOTE(review): for a degenerate axis (dim <= 1) the stride never affects
     * the result; the substituted values look chosen to satisfy cublas'
     * lda >= max(1, rows) constraint -- confirm against cublas docs.
     */
    int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
    int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
    int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
    int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;

    if (sa_0 == 1)
    {
        // Rows are contiguous along axis 0: A is laid out column-major,
        // so call the non-transposed kernel with lda = column stride.
        cublasSgemv('N',
                    CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
                    alpha,
                    CudaNdarray_DEV_DATA(A), sa_1,
                    CudaNdarray_DEV_DATA(B), sb_0,
                    beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
    }
    else if (sa_1 == 1)
    {
        // Rows are contiguous along axis 1 (C/row-major layout): present A
        // to cublas as its column-major transpose and ask for 'T'.
        cublasSgemv('T',
                    CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
                    alpha,
                    CudaNdarray_DEV_DATA(A), sa_0,
                    CudaNdarray_DEV_DATA(B), sb_0,
                    beta,
                    CudaNdarray_DEV_DATA(C), sc_0);
    }
    else
    {
        // Fixed message typo ("strides strides").
        PyErr_SetString(PyExc_NotImplementedError, "too many strides in sgemv");
        return -1;
    }
    CNDA_THREAD_SYNC;
    cudaError_t err = cudaGetLastError();
    // BUGFIX: err is a cudaError_t, so compare against cudaSuccess rather
    // than CUBLAS_STATUS_SUCCESS (a value from the unrelated cublasStatus
    // enum -- both happen to be 0, but the mixed comparison is incorrect).
    if (cudaSuccess != err)
    {
        // Fixed message typo ("cublassGemv").
        PyErr_Format(PyExc_RuntimeError, "cublasSgemv failed (%s)", cudaGetErrorString(err));
        return -1;
    }
    return 0;
}
int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) { int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray * A) {
if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; } if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; } if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
...@@ -3033,17 +3119,50 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray ...@@ -3033,17 +3119,50 @@ int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray
PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger"); PyErr_SetString(PyExc_NotImplementedError, "non-c continugous A in sger");
return -1; return -1;
} }
// Since Sger expects A in col-major, we invert x and y to fake this.
int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
CudaNdarray * x_ = x;
if(x_strides == 0){
if(CudaNdarray_HOST_DIMS(x)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sger: Invalid input x(should not happen)."
" We received an CudaNdarray vector with a stride of 0"
" that have more then 1 elements!");
return -1;
}
x_strides = 4;
} else if(x_strides < 0){
x_ = (CudaNdarray*)CudaNdarray_Copy(x);
x_strides = CudaNdarray_HOST_STRIDES(x_)[0];
}
// Same for this, be safe int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
assert (CudaNdarray_HOST_STRIDES(x)[0] >= 0); CudaNdarray * y_ = y;
assert (CudaNdarray_HOST_STRIDES(y)[0] >= 0); if(y_strides == 0){
if(CudaNdarray_HOST_DIMS(y)[0] != 1){
PyErr_Format(PyExc_RuntimeError,
"CudaNdarray_sger: Invalid input y(should not happen)."
" We received an CudaNdarray vector with a stride of 0"
" that have more then 1 elements!");
return -1;
}
y_strides = 4;
} else if(y_strides < 0){
y_ = (CudaNdarray*)CudaNdarray_Copy(y);
y_strides = CudaNdarray_HOST_STRIDES(y_)[0];
}
// Since Sger expects A in col-major, we invert x and y to fake this. if(CudaNdarray_SIZE(A)){
cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha, cublasSger(CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], alpha,
CudaNdarray_DEV_DATA(y), CudaNdarray_HOST_STRIDES(y)[0], CudaNdarray_DEV_DATA(y_), y_strides,
CudaNdarray_DEV_DATA(x), CudaNdarray_HOST_STRIDES(x)[0], CudaNdarray_DEV_DATA(x_), x_strides,
CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]); CudaNdarray_DEV_DATA(A), CudaNdarray_HOST_DIMS(A)[1]);
}
CNDA_THREAD_SYNC; CNDA_THREAD_SYNC;
if(x_ != x)
Py_DECREF(x_);
if(y_ != y)
Py_DECREF(y_);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (CUBLAS_STATUS_SUCCESS != err) if (CUBLAS_STATUS_SUCCESS != err)
......
...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self); ...@@ -320,6 +320,7 @@ DllExport bool CudaNdarray_is_c_contiguous(const CudaNdarray * self);
DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self); DllExport PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self);
DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C); DllExport int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C);
DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A); DllExport int CudaNdarray_sger(float alpha, CudaNdarray * x, CudaNdarray * y, CudaNdarray* A);
DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A); DllExport int CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A);
......
...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import * ...@@ -16,6 +16,10 @@ from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar,
gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv) gpu_gemm_inplace, gpu_gemm_no_inplace, gpu_outer, GpuConv)
from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax,
GpuDownsampleFactorMaxGrad) GpuDownsampleFactorMaxGrad)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
...@@ -375,47 +379,85 @@ def local_gpu_dot22scalar(node): ...@@ -375,47 +379,85 @@ def local_gpu_dot22scalar(node):
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemv_as_gemm(node): def local_gpu_gemv(node):
""" """
gpu_from_host(gemv) -> gpu_gemv(gpu_from_host) gpu_from_host(gemv) -> gpu_gemv(gpu_from_host)
gemm(host_from_gpu) -> host_from_gpu(gpu_gemv) gemv(host_from_gpu) -> host_from_gpu(gpu_gemv)
This optimization solves the vector-matrix multiplication issue by
transforming the vector into a matrix, apply gpudot22 and reshaping
the output.
A more suitable solution would be to use the right cublas call
""" """
gemvs = {tensor.blas.gemv_inplace: gpu_gemm_inplace, gemvs = {
tensor.blas.gemv_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemv_inplace: gpu_gemv_no_inplace,
tensor.blas.gemv_no_inplace: gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=True): gpu_gemv_no_inplace,
tensor.blas_c.CGemv(inplace=False): gpu_gemv_no_inplace,
}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gemvs: if host_input.owner and host_input.owner.op in gemvs:
op = host_input.owner.op op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs z, a, x, y, b = host_input.owner.inputs
return [ return [gemvs[op](
GpuDimShuffle((False,True),[0])(gemvs[op]( gpu_from_host(z)
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z))
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b))] , b)]
if node.op in gemvs: if node.op in gemvs:
z, a, x, y, b = node.inputs z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu) x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu) y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu) z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(GpuDimShuffle((False,True),[0])( return [host_from_gpu(
gemvs[node.op]( gemvs[node.op](
GpuDimShuffle((False,),[0,'x'])(gpu_from_host(z)) gpu_from_host(z)
, a , a
, gpu_from_host(x) , gpu_from_host(x)
, GpuDimShuffle((False,),[0,'x'])(gpu_from_host(y)) , gpu_from_host(y)
, b)))] , b))]
return False return False
@register_opt()
@local_optimizer([])
def local_gpu_ger(node):
"""
gpu_from_host(ger) -> gpu_ger(gpu_from_host)
ger(host_from_gpu) -> host_from_gpu(gpu_ger)
"""
gers = {
tensor.blas_c.CGer(destructive=True): gpu_ger_no_inplace,
tensor.blas_c.CGer(destructive=False): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=True): gpu_ger_no_inplace,
tensor.blas.Ger(destructive=False): gpu_ger_no_inplace,
}
if node.op == gpu_from_host:
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op in gers:
op = host_input.owner.op
z, a, x, y = host_input.owner.inputs
return [gers[op](
gpu_from_host(z)
, a
, gpu_from_host(x)
, gpu_from_host(y)
)]
if node.op in gers:
z, a, x, y = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(
gers[node.op](
gpu_from_host(z)
, a
, gpu_from_host(x)
, gpu_from_host(y)
))]
return False
@register_opt() @register_opt()
@local_optimizer([]) @local_optimizer([])
def local_gpu_gemm(node): def local_gpu_gemm(node):
...@@ -424,7 +466,8 @@ def local_gpu_gemm(node): ...@@ -424,7 +466,8 @@ def local_gpu_gemm(node):
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm) gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
""" """
gemms = {tensor.blas.gemm_inplace: gpu_gemm_inplace, gemms = {
#tensor.blas.gemm_inplace: gpu_gemm_inplace,
tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace} tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
if node.op == gpu_from_host: if node.op == gpu_from_host:
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -924,15 +967,32 @@ def local_inplace_gemm(node): ...@@ -924,15 +967,32 @@ def local_inplace_gemm(node):
if node.op == gpu_gemm_no_inplace: if node.op == gpu_gemm_no_inplace:
return [gpu_gemm_inplace(*node.inputs)] return [gpu_gemm_inplace(*node.inputs)]
@local_optimizer([gpu_gemv_no_inplace])
def local_inplace_gemv(node):
if node.op == gpu_gemv_no_inplace:
return [gpu_gemv_inplace(*node.inputs)]
@local_optimizer([gpu_gemm_no_inplace])
def local_inplace_ger(node):
if node.op == gpu_ger_no_inplace:
return [gpu_ger_inplace(*node.inputs)]
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gpu gemm inplace # Try to make gpu gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of # Also, need to make the gemm optimisation(step 70) happen before the fusion of
# elemwise(step 71) # elemwise(step 71)
optdb.register('InplaceGpuBlasOpt', optdb.register('InplaceGpuBlasOpt',
EquilibriumOptimizer([local_inplace_gemm], failure_callback=EquilibriumOptimizer.warn_inplace, EquilibriumOptimizer([local_inplace_gemm,
local_inplace_gemv,
local_inplace_ger,
],
failure_callback=EquilibriumOptimizer.warn_inplace,
max_use_ratio=5), max_use_ratio=5),
70.0, 'fast_run', 'inplace', 'gpu') 70.0, 'fast_run', 'inplace', 'gpu')
def get_device_type_sizes(): def get_device_type_sizes():
""" """
:return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu)) :return:(gpu ptr size, cpu ptr size, int sizes(gpu and cpu))
......
from unittest import TestCase
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools
...@@ -15,6 +17,9 @@ import theano.sandbox.cuda as tcn ...@@ -15,6 +17,9 @@ import theano.sandbox.cuda as tcn
from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad from theano.tensor.signal.downsample import DownsampleFactorMax, DownsampleFactorMaxGrad
import theano.compile.mode import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
if theano.config.mode=='FAST_COMPILE': if theano.config.mode=='FAST_COMPILE':
...@@ -243,3 +248,39 @@ def test_downsample(): ...@@ -243,3 +248,39 @@ def test_downsample():
#We already check that the gpu version return the same value as the gpu version #We already check that the gpu version return the same value as the gpu version
#for GpuDownsampleFactorMaxGrad. So no need to call verify_grad here. #for GpuDownsampleFactorMaxGrad. So no need to call verify_grad here.
class TestGpuGemv(TestCase, BaseGemv,
                  unittest_tools.TestOptimizationMixin):
    """Run the shared BaseGemv test battery against the GPU gemv Op."""
    mode = mode_with_gpu
    dtype = 'float32'
    # As all inputs are transferred to the gpu, this allows making all
    # the gemv calls inplace.
    gemv = gpu_gemv_inplace
    gemv_inplace = gpu_gemv_inplace
class TestGpuGer(TestGer):
    """Run the shared TestGer battery against the GPU ger Op."""
    def setUp(self):
        self.mode = mode_with_gpu
        dtype = self.dtype = 'float32'  # optimization isn't dtype-dependent
        # Symbolic inputs: matrix A, scalar a, vectors x and y.
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dtype, broadcastable=())
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.ger = gpu_ger_no_inplace
        self.ger_destructive = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_no_inplace
        # data on the gpu make the op always inplace.
        # NOTE: these two assignments deliberately override the
        # no_inplace values set just above, because on the GPU the
        # expected graphs contain the inplace variants.
        self.ger = gpu_ger_inplace
        self.gemm = tcn.blas.gpu_gemm_inplace
class TestGpuGer_OpContract(TestCase, unittest_tools.T_OpContractMixin):
    """Check both GpuGer variants against the generic Op contract."""
    def setUp(self):
        # Ops whose __eq__/__hash__/__str__ behavior is verified.
        self.ops = [gpu_ger_no_inplace, gpu_ger_inplace]

    def clone(self, op):
        # Rebuild an equivalent Op from its inplace flag.
        return tcn.blas.GpuGer(op.inplace)
...@@ -10,6 +10,7 @@ import opt ...@@ -10,6 +10,7 @@ import opt
import opt_uncanonicalize import opt_uncanonicalize
import blas import blas
import blas_scipy import blas_scipy
import blas_c
import xlogx import xlogx
import raw_random import raw_random
......
...@@ -2540,6 +2540,41 @@ class Alloc(gof.Op): ...@@ -2540,6 +2540,41 @@ class Alloc(gof.Op):
#reuse the allocated memory. #reuse the allocated memory.
out[0][...] = v # broadcast v to fill us up out[0][...] = v # broadcast v to fill us up
def c_code(self, node, name, inp, out, sub):
    """Generate C code for Alloc.

    Fast path only for the common case of filling a 1-d output from a
    0-d scalar; every other case falls back to the parent class's
    generic implementation.
    """
    # TODO: use the elemwise code generator here
    if node.inputs[0].ndim == 0:
        # filling with a scalar is a common use of alloc
        # that we can implement relatively easily
        vv = inp[0]
        zz, = out
        fail = sub['fail']
        if node.outputs[0].ndim == 1:
            N0 = inp[1]
            # NOTE(review): the C-level assert below assumes any reused
            # output buffer has unit element stride; confirm callers
            # guarantee contiguity, since the check vanishes under
            # NDEBUG builds.
            return """
            npy_intp N0 = ((dtype_%(N0)s*)%(N0)s->data)[0];
            dtype_%(vv)s vv;
            dtype_%(zz)s* zz;
            if ((NULL == %(zz)s) || (%(zz)s->dimensions[0] != N0))
            {
                if (%(zz)s) Py_XDECREF(%(zz)s);
                %(zz)s = (PyArrayObject*)PyArray_SimpleNew(1,
                    &N0, type_num_%(vv)s);
                if(!%(zz)s) {
                    PyErr_SetString(PyExc_MemoryError, "alloc failed");
                    %(fail)s
                }
            }
            vv = ((dtype_%(vv)s*)%(vv)s->data)[0];
            zz = ((dtype_%(zz)s*)%(zz)s->data);
            assert (%(zz)s->strides[0] == sizeof(dtype_%(zz)s));
            for (int i = 0; i < N0; ++i)
            {
                zz[i] = vv;
            }
            """ % locals()
    # else pretend this never happened
    return super(Alloc, self).c_code(node, name, inp, out, sub)
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [node.inputs[1:]] return [node.inputs[1:]]
......
...@@ -6,18 +6,26 @@ Learn more about BLAS here: ...@@ -6,18 +6,26 @@ Learn more about BLAS here:
The standard BLAS libraries implement what is called "legacy BLAS" in that The standard BLAS libraries implement what is called "legacy BLAS" in that
document. document.
This documentation section describes Theano's BLAS optimization This documentation describes Theano's BLAS optimization pipeline.
pipeline.
Where there is a discrepancy between how things do work and how they *should* Where there is a discrepancy between how things do work and how they *should*
work, both aspects should be documented. It helps keep a broader agenda in view work, both aspects should be documented.
even while fixing little bugs etc. from day to day.
There are four kinds of BLAS Ops in Theano:
- Python implementations (this file)
- SciPy-based (blas_scipy)
- C-based (blas_c)
- CUDA-based (theano.sandbox.cuda.blas)
:note: Unfortunately (because it's confusing) this file currently contains Ops
that contain both Python and C versions. I think it would be better to
move the C implementations to blas_c so that this file is pure Python.
-JB
Ops Ops
=== ===
There are two BLAS calls wrapped in this module: GEMM and GEMV.
GEMM: Dot22, Dot22Scalar, GemmRelated, Gemm GEMM: Dot22, Dot22Scalar, GemmRelated, Gemm
------------------------------------------- -------------------------------------------
...@@ -43,18 +51,19 @@ GEMV: Gemv ...@@ -43,18 +51,19 @@ GEMV: Gemv
---------- ----------
The BLAS GEMV operation implements Z <- a X Y + b Z, The BLAS GEMV operation implements Z <- a X Y + b Z,
where Z is a matrix, Y, and Z are vectors, and a and b are scalars. where X is a matrix, Y, and Z are vectors, and a and b are scalars.
Gemv implements the GEMV call in all its generality. GER: Ger
--------
The BLAS GER operation implements Z <- a X' Y + Z,
where X and Y are vectors, and matrix Z gets a rank-1 update.
Other Notable BLAS-related Ops Other Notable BLAS-related Ops
------------------------------ ------------------------------
GpuOuter is currently a wrapper around GER. GER is a useful special case of
GEMM, and in the future it would be good to have a GER Op. With a GER Op here,
the GpuOuter could be turned into a GpuGER.
SYRK is another useful special case of GEMM. Particularly SYRK preserves SYRK is another useful special case of GEMM. Particularly SYRK preserves
symmetry in the matrix that it updates. See how the linear-algebra module uses symmetry in the matrix that it updates. See how the linear-algebra module uses
symmetry hints before implementing this Op, so that this Op is compatible with symmetry hints before implementing this Op, so that this Op is compatible with
...@@ -64,14 +73,19 @@ that system. ...@@ -64,14 +73,19 @@ that system.
Optimizations Optimizations
============= =============
The current optimization pipeline is not exactly clear to me. Instead I will The optimization pipeline works something like this:
describe how it should work.
The high level pipeline is:
1. identify dot22 from dot 1. identify dot22 from dot
2. identify gemm from dot22 2. identify gemm from dot22
3. identify dot22scalar from dot22 that are not gemm 3. identify dot22scalar from dot22 that are not gemm
4. specialize gemm to gemv where applicable 4. specialize gemm to gemv where applicable
5. specialize gemm to ger where applicable
6. specialize dot22 -> gemv or ger where applicable
:note: GEMM is the most canonical BLAS signature that we deal with so far, it
would be good to turn most things into GEMM (dot, inner, outer, dot22,
dot22scalar), and then to specialize from gemm to the various other L2 and
L3 operations.
Identify Dot22 Identify Dot22
-------------- --------------
...@@ -161,9 +175,9 @@ class Gemv(Op): ...@@ -161,9 +175,9 @@ class Gemv(Op):
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
return 'Gemv{inplace}' return '%s{inplace}' % self.__class__.__name__
else: else:
return 'Gemv{no_inplace}' return '%s{no_inplace}' % self.__class__.__name__
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -268,31 +282,19 @@ class Ger(Op): ...@@ -268,31 +282,19 @@ class Ger(Op):
raise TypeError('only float and complex types supported', x.dtype) raise TypeError('only float and complex types supported', x.dtype)
return Apply(self, [A, alpha, x, y], [A.type()]) return Apply(self, [A, alpha, x, y], [A.type()])
def make_thunk(self, node, storage_map, compute_map, no_recycling): def perform(self, node, inp, out):
node_input_storage = [storage_map[r] for r in node.inputs] cA, calpha, cx, cy = inp
node_output_storage = [storage_map[r] for r in node.outputs] cZ, = out
if self.destructive:
# get vars for containers A = cA
cA, calpha, cx, cy = node_input_storage else:
cZ, = node_output_storage A = cA.copy()
if calpha != 1:
A += calpha * numpy.outer(cx, cy)
else:
A += numpy.outer(cx, cy)
cZ[0] = A
def rval():
if self.destructive:
A = cA[0]
else:
A = cA[0].copy()
if calpha[0] != 1:
A += calpha[0] * numpy.outer(cx[0], cy[0])
else:
A += numpy.outer(cx[0], cy[0])
cZ[0] = A
#TODO: If this is currently an unofficial part of the thunk API,
# then maybe it should be documented and made official?
rval.inputs = node_input_storage
rval.outputs = node_output_storage
rval.lazy = False
return rval
ger = Ger(destructive=False) ger = Ger(destructive=False)
ger_destructive = Ger(destructive=True) ger_destructive = Ger(destructive=True)
...@@ -1148,7 +1150,7 @@ def _gemm_from_factored_list(lst): ...@@ -1148,7 +1150,7 @@ def _gemm_from_factored_list(lst):
# Try every pair in the sM_list, trying to turn it into a gemm operation # Try every pair in the sM_list, trying to turn it into a gemm operation
for i in xrange(len(lst) - 1): for i in xrange(len(lst) - 1):
s_i,M_i = lst[i] s_i, M_i = lst[i]
for j in xrange(i+1, len(lst)): for j in xrange(i+1, len(lst)):
s_j, M_j = lst[j] s_j, M_j = lst[j]
...@@ -1400,9 +1402,7 @@ def local_gemm_to_ger(node): ...@@ -1400,9 +1402,7 @@ def local_gemm_to_ger(node):
rval = ger(z, a, xv, yv) rval = ger(z, a, xv, yv)
return [rval] return [rval]
elif bval == 0: # GER on zeros_like should be faster than GEMM elif bval == 0: # GER on zeros_like should be faster than GEMM
zeros = T.alloc( zeros = T.zeros([x.shape[0], y.shape[1]], x.dtype)
numpy.asarray(0, dtype=x.dtype),
x.shape[0], y.shape[1])
rval = ger(zeros, a, xv, yv) rval = ger(zeros, a, xv, yv)
return [rval] return [rval]
else: else:
...@@ -1414,20 +1414,43 @@ def local_gemm_to_ger(node): ...@@ -1414,20 +1414,43 @@ def local_gemm_to_ger(node):
#TODO: delete this optimization when we have the proper dot->gemm->ger pipeline #TODO: delete this optimization when we have the proper dot->gemm->ger pipeline
# working # working
@local_optimizer([_dot22]) @local_optimizer([_dot22])
def local_dot22_to_ger(node): def local_dot22_to_ger_or_gemv(node):
"""GEMM computing an outer-product -> GER """dot22 computing an outer-product -> GER
""" """
if node.op == _dot22: if node.op == _dot22:
x, y = node.inputs x, y = node.inputs
if x.broadcastable[1] and y.broadcastable[0]: xb = x.broadcastable
yb = y.broadcastable
one = T.as_tensor_variable(numpy.asarray(1, dtype=x.dtype))
zero = T.as_tensor_variable(numpy.asarray(0, dtype=x.dtype))
if xb[1] and yb[0]:
# x and y are both vectors so this might qualifies for a GER # x and y are both vectors so this might qualifies for a GER
xv = x.dimshuffle(0) xv = x.dimshuffle(0)
yv = y.dimshuffle(1) yv = y.dimshuffle(1)
one = T.as_tensor_variable(numpy.asarray(1, dtype=x.dtype)) zeros = T.zeros([x.shape[0], y.shape[1]], dtype=x.dtype)
zeros = T.alloc(numpy.asarray(0, dtype=x.dtype), x.shape[0], y.shape[1])
rval = ger(zeros, one, xv, yv) rval = ger(zeros, one, xv, yv)
return [rval] return [rval]
if xb[0] and yb[1]:
# x and y are both vectors so this qualifies for a sdot / ddot
# TODO: Theano doesn't have a sdot, but gemv is better than _dot22
xv = x.dimshuffle(1)
zeros = T.zeros([1], x.dtype)
rval = gemv_no_inplace(zeros, one, y.T, xv, one)
return [rval.dimshuffle('x', 0)]
if xb[0] and not yb[0] and not yb[1]:
# x is vector, y is matrix so try gemv
xv = x.dimshuffle(1)
zeros = T.zeros([y.shape[1]], x.dtype)
rval = gemv_no_inplace(zeros, one, y.T, xv, one)
return [rval.dimshuffle('x', 0)]
if not xb[0] and not xb[1] and yb[1]:
# x is matrix, y is vector, try gemv
yv = y.dimshuffle(0)
zeros = T.zeros([x.shape[0]], dtype=x.dtype)
rval = gemv_no_inplace(zeros, one, x, yv, one)
return [rval.dimshuffle(0, 'x')]
################################# #################################
# #
...@@ -1445,14 +1468,14 @@ optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run') ...@@ -1445,14 +1468,14 @@ optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
blas_optdb.register('local_dot_to_dot22', blas_optdb.register('local_dot_to_dot22',
EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5), EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
0, 'fast_run') 0, 'fast_run')
blas_optdb.register('local_dot_to_gemm', blas_optdb.register('gemm_optimizer',
GemmOptimizer(), GemmOptimizer(),
10, 'fast_run') 10, 'fast_run')
blas_optdb.register('local_gemm_to_gemv', blas_optdb.register('local_gemm_to_gemv',
EquilibriumOptimizer([ EquilibriumOptimizer([
local_gemm_to_gemv, local_gemm_to_gemv,
local_gemm_to_ger, local_gemm_to_ger,
local_dot22_to_ger, local_dot22_to_ger_or_gemv,
local_dimshuffle_lift], local_dimshuffle_lift],
max_use_ratio=5), max_use_ratio=5),
15, 'fast_run') 15, 'fast_run')
......
Diff collapsed.
...@@ -23,10 +23,9 @@ try: ...@@ -23,10 +23,9 @@ try:
numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgeru, numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgeru,
numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgeru, numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgeru,
} }
optimizations_enabled = True
except ImportError, e: except ImportError, e:
have_fblas = False have_fblas = False
optimizations_enabled = False
class ScipyGer(Ger): class ScipyGer(Ger):
...@@ -62,13 +61,11 @@ class ScipyGer(Ger): ...@@ -62,13 +61,11 @@ class ScipyGer(Ger):
@local_optimizer([ger, ger_destructive]) @local_optimizer([ger, ger_destructive])
def use_scipy_ger(node): def use_scipy_ger(node):
if not optimizations_enabled: return
if node.op == ger: if node.op == ger:
return [ScipyGer(False)(*node.inputs)] return [ScipyGer(False)(*node.inputs)]
@local_optimizer([ScipyGer(False)]) @local_optimizer([ScipyGer(False)])
def make_ger_destructive(node): def make_ger_destructive(node):
if not optimizations_enabled: return
if node.op == ScipyGer(False): if node.op == ScipyGer(False):
return [ScipyGer(True)(*node.inputs)] return [ScipyGer(True)(*node.inputs)]
......
import sys
import numpy
import theano
import theano.tensor as tensor
from theano.tensor.blas_c import CGer
from theano.tensor.blas_scipy import ScipyGer
from theano.tensor.blas import Ger
from theano.tensor.blas_c import CGemv
from theano.tensor.blas_scipy import ScipyGer
from theano.tensor.blas import Gemv
from theano.tests import unittest_tools
from theano.tests.unittest_tools import TestOptimizationMixin
from test_blas import TestCase
from test_blas import BaseGemv
# Compilation mode with every relevant BLAS rewrite enabled: the generic
# BlasOpt pipeline, specialization, the in-place BLAS substitutions, and the
# C-implemented ops (c_blas) that these tests are meant to exercise.
mode_blas_opt = theano.compile.get_default_mode().including(
    'BlasOpt', 'specialize', 'InplaceBlasOpt', 'c_blas')
class TestCGer(TestCase, TestOptimizationMixin):
    """Tests for the C-implementation of Ger (rank-1 update).

    Covers the __eq__/__hash__ contract of CGer and that the optimizer
    rewrites outer-product graphs into CGer nodes.
    """

    def setUp(self, dtype='float64'):
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        # Symbolic inputs: matrix A, scalar a, vectors x and y.
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.a = tensor.tensor(dtype=dtype, broadcastable=())
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        # Numeric values whose shapes match the symbolic variables above.
        self.Aval = numpy.ones((2, 3), dtype=dtype)
        self.xval = numpy.asarray([1, 2], dtype=dtype)
        self.yval = numpy.asarray([1.5, 2.7, 3.9], dtype=dtype)

    def function(self, inputs, outputs):
        # Compile with this test's mode so the BLAS rewrites run.
        return theano.function(inputs, outputs, mode=self.mode)

    def run_f(self, f):
        return f(self.Aval, self.xval, self.yval)

    def b(self, bval):
        # Wrap a python scalar as a constant of this test's dtype.
        return tensor.as_tensor_variable(numpy.asarray(bval, dtype=self.dtype))

    def test_eq(self):
        # Equality depends on the destructive flag and on the exact class.
        for destructive in (True, False):
            self.assert_(CGer(destructive) == CGer(destructive))
            self.assert_(CGer(destructive) != ScipyGer(destructive))
            self.assert_(CGer(destructive) != Ger(destructive))
            # __eq__ must also behave for non-CGer operands
            self.assert_(CGer(destructive) != None)
        self.assert_(CGer(False) != CGer(True))

    def test_hash(self):
        # Hash agrees with equality: same flag hashes alike, flags differ.
        for destructive in (True, False):
            self.assert_(hash(CGer(destructive)) == hash(CGer(destructive)))
        self.assert_(hash(CGer(False)) != hash(CGer(True)))

    def test_optimization_pipeline(self):
        outer = tensor.outer(self.x, self.y)
        f = self.function([self.x, self.y], outer)
        self.assertFunctionContains(f, CGer(destructive=True))
        f(self.xval, self.yval)  # DebugMode tests correctness

    def test_optimization_pipeline_float(self):
        # Same pipeline check, but with float32 inputs.
        self.setUp('float32')
        outer = tensor.outer(self.x, self.y)
        f = self.function([self.x, self.y], outer)
        self.assertFunctionContains(f, CGer(destructive=True))
        f(self.xval, self.yval)  # DebugMode tests correctness

    def test_int_fails(self):
        # CGer only supports float dtypes; int graphs must not be rewritten.
        self.setUp('int32')
        f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
        self.assertFunctionContains0(f, CGer(destructive=True))
        self.assertFunctionContains0(f, CGer(destructive=False))

    def test_A_plus_outer(self):
        expr = self.A + tensor.outer(self.x, self.y)
        f = self.function([self.A, self.x, self.y], expr)
        # A is an input here, so the rewrite must not destroy it.
        self.assertFunctionContains(f, CGer(destructive=False))
        self.run_f(f)  # DebugMode tests correctness

    def test_A_plus_scaled_outer(self):
        expr = self.A + 0.1 * tensor.outer(self.x, self.y)
        f = self.function([self.A, self.x, self.y], expr)
        self.assertFunctionContains(f, CGer(destructive=False))
        self.run_f(f)  # DebugMode tests correctness
class TestCGemv(TestCase, TestOptimizationMixin):
    """Tests of CGemv specifically.

    Generic tests of Gemv-compatibility, covering both dtypes, are done
    below in TestCGemvFloat32 and TestCGemvFloat64.
    """

    def setUp(self, dtype='float64'):
        self.dtype = dtype
        self.mode = theano.compile.get_default_mode().including('fast_run')
        # matrix and its value
        self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
        self.Aval = numpy.ones((2, 3), dtype=dtype)
        # vectors and their values
        self.x = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.y = tensor.tensor(dtype=dtype, broadcastable=(False,))
        self.xval = numpy.asarray([1, 2], dtype=dtype)
        self.yval = numpy.asarray([1.5, 2.7, 3.9], dtype=dtype)
        # scalar
        self.a = tensor.tensor(dtype=dtype, broadcastable=())

    def test_optimizations_vm(self):
        """Test vector dot matrix."""
        f = theano.function([self.x, self.A],
                            theano.dot(self.x, self.A),
                            mode=self.mode)
        # The dot must have been rewritten into exactly one in-place CGemv.
        self.assertFunctionContains0(f, tensor.dot)
        self.assertFunctionContains1(f, CGemv(True))
        # The rewritten graph must still compute the same values as numpy.
        got = f(self.xval, self.Aval)
        assert numpy.allclose(got, numpy.dot(self.xval, self.Aval))

    def test_optimizations_mv(self):
        """Test matrix dot vector."""
        f = theano.function([self.A, self.y],
                            theano.dot(self.A, self.y),
                            mode=self.mode)
        # The dot must have been rewritten into exactly one in-place CGemv.
        self.assertFunctionContains0(f, tensor.dot)
        self.assertFunctionContains1(f, CGemv(True))
        # The rewritten graph must still compute the same values as numpy.
        got = f(self.Aval, self.yval)
        assert numpy.allclose(got, numpy.dot(self.Aval, self.yval))

    def t_gemv1(self, m_shp):
        """Check vector2 + dot(matrix, vector1), plain and in-place."""
        rng = numpy.random.RandomState(unittest_tools.fetch_seed())
        # NOTE: keep the rng.uniform call order (v1, v2, m) stable so the
        # generated values are reproducible across refactorings.
        v1 = theano.shared(numpy.array(rng.uniform(size=(m_shp[1],)),
                                       dtype='float32'))
        v2_orig = numpy.array(rng.uniform(size=(m_shp[0],)), dtype='float32')
        v2 = theano.shared(v2_orig)
        m = theano.shared(numpy.array(rng.uniform(size=m_shp),
                                      dtype='float32'))

        expected = numpy.dot(m.get_value(), v1.get_value()) + v2_orig

        # Non-inplace form: a fresh output, computed by CGemv(inplace=False).
        f = theano.function([], v2 + tensor.dot(m, v1), mode=self.mode)
        assert numpy.allclose(f(), expected)
        topo = [n.op for n in f.maker.env.toposort()]
        assert topo == [CGemv(inplace=False)], topo

        # In-place form: updating v2 must use the destructive CGemv.
        g = theano.function([], [],
                            updates={v2: v2 + theano.dot(m, v1)},
                            mode=self.mode)
        g()
        assert numpy.allclose(v2.get_value(), expected)
        topo = [n.op for n in g.maker.env.toposort()]
        assert topo == [CGemv(inplace=True)]

    def test_gemv1(self):
        # Include empty-dimension corner cases.
        for shp in [(3, 2), (0, 2), (3, 0), (0, 0)]:
            self.t_gemv1(shp)

    def test_gemv_dimensions(self, dtype='float32'):
        """Mismatched vector lengths must raise ValueError at call time."""
        alpha = theano.shared(theano._asarray(1.0, dtype=dtype),
                              name='alpha')
        beta = theano.shared(theano._asarray(1.0, dtype=dtype),
                             name='beta')
        z = beta * self.y + alpha * tensor.dot(self.A, self.x)
        f = theano.function([self.A, self.x, self.y], z, mode=self.mode)

        A_val = numpy.ones((5, 3), dtype=dtype)
        ones = dict((n, numpy.ones(n, dtype=dtype)) for n in (3, 4, 5, 6))
        # Consistent lengths: x of 3 (A columns), y of 5 (A rows) — works.
        f(A_val, ones[3], ones[5])
        # Any length mismatch must be rejected.
        self.assertRaises(ValueError, f, A_val, ones[4], ones[5])
        self.assertRaises(ValueError, f, A_val, ones[3], ones[6])
        self.assertRaises(ValueError, f, A_val, ones[4], ones[6])
class TestCGemvFloat32(TestCase, BaseGemv, TestOptimizationMixin):
    """Run the generic BaseGemv test battery against CGemv in float32."""
    mode = mode_blas_opt
    dtype = 'float32'
    gemv = CGemv(inplace=False)
    gemv_inplace = CGemv(inplace=True)
class TestCGemvFloat64(TestCase, BaseGemv, TestOptimizationMixin):
    """Run the generic BaseGemv test battery against CGemv in float64."""
    mode = mode_blas_opt
    dtype = 'float64'
    gemv = CGemv(inplace=False)
    gemv_inplace = CGemv(inplace=True)
...@@ -4,12 +4,15 @@ import theano ...@@ -4,12 +4,15 @@ import theano
import theano.tensor as tensor import theano.tensor as tensor
from theano.tensor.blas_scipy import ScipyGer from theano.tensor.blas_scipy import ScipyGer
from test_blas import TestCase, TestOptimizationMixin, gemm_no_inplace from test_blas import TestCase, gemm_no_inplace
from theano.tests.unittest_tools import TestOptimizationMixin
class TestScipyGer(TestCase, TestOptimizationMixin): class TestScipyGer(TestCase, TestOptimizationMixin):
def setUp(self): def setUp(self):
self.mode = theano.compile.get_default_mode().including('fast_run') self.mode = theano.compile.get_default_mode()
self.mode = self.mode.including('fast_run')
self.mode = self.mode.excluding('c_blas') # c_blas trumps scipy Ops
dtype = self.dtype = 'float64' # optimization isn't dtype-dependent dtype = self.dtype = 'float64' # optimization isn't dtype-dependent
self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False)) self.A = tensor.tensor(dtype=dtype, broadcastable=(False, False))
self.a = tensor.tensor(dtype=dtype, broadcastable=()) self.a = tensor.tensor(dtype=dtype, broadcastable=())
...@@ -18,9 +21,10 @@ class TestScipyGer(TestCase, TestOptimizationMixin): ...@@ -18,9 +21,10 @@ class TestScipyGer(TestCase, TestOptimizationMixin):
self.Aval = numpy.ones((2,3), dtype=dtype) self.Aval = numpy.ones((2,3), dtype=dtype)
self.xval = numpy.asarray([1,2], dtype=dtype) self.xval = numpy.asarray([1,2], dtype=dtype)
self.yval = numpy.asarray([1.5,2.7,3.9], dtype=dtype) self.yval = numpy.asarray([1.5,2.7,3.9], dtype=dtype)
if not theano.tensor.blas_scipy.optimizations_enabled: if not theano.tensor.blas_scipy.have_fblas:
self.SkipTest() self.SkipTest()
def function(self, inputs, outputs): def function(self, inputs, outputs):
return theano.function(inputs, outputs, self.mode) return theano.function(inputs, outputs, self.mode)
......
from copy import copy, deepcopy
import sys import sys
import numpy import numpy
import theano.tensor as T import theano.tensor as T
from theano.configparser import config, AddConfigVar, StrParam from theano.configparser import config, AddConfigVar, StrParam
try:
from nose.plugins.skip import SkipTest
except ImportError:
class SkipTest(Exception):
"""
Skip this test
"""
AddConfigVar('unittests.rseed', AddConfigVar('unittests.rseed',
"Seed to use for randomized unit tests. Special value 'random' means using a seed of None.", "Seed to use for randomized unit tests. Special value 'random' means using a seed of None.",
StrParam(666), StrParam(666),
in_c_key=False) in_c_key=False)
def fetch_seed(pseed=None): def fetch_seed(pseed=None):
""" """
Returns the seed to use for running the unit tests. Returns the seed to use for running the unit tests.
...@@ -38,6 +48,7 @@ def fetch_seed(pseed=None): ...@@ -38,6 +48,7 @@ def fetch_seed(pseed=None):
return seed return seed
def seed_rng(pseed=None): def seed_rng(pseed=None):
""" """
Seeds numpy's random number generator with the value returned by fetch_seed. Seeds numpy's random number generator with the value returned by fetch_seed.
...@@ -51,6 +62,7 @@ def seed_rng(pseed=None): ...@@ -51,6 +62,7 @@ def seed_rng(pseed=None):
numpy.random.seed(seed) numpy.random.seed(seed)
return seed return seed
def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs): def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
""" """
Wrapper for tensor/basic.py:verify_grad Wrapper for tensor/basic.py:verify_grad
...@@ -72,3 +84,67 @@ def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs): ...@@ -72,3 +84,67 @@ def verify_grad(op, pt, n_tests=2, rng=None, *args, **kwargs):
# raise # raise
# #
verify_grad.E_grad = T.verify_grad.E_grad verify_grad.E_grad = T.verify_grad.E_grad
class TestOptimizationMixin(object):
    """Mixin with assertions about which Ops appear in a compiled function.

    `f` is expected to be a compiled theano function exposing
    `f.maker.env.toposort()`.
    """

    # sys.maxint was Python-2-only (removed in Python 3); sys.maxsize exists
    # in 2.6+ and 3.x and serves the same "effectively unbounded" purpose.
    def assertFunctionContains(self, f, op, min=1, max=sys.maxsize):
        """Assert that `op` occurs between `min` and `max` times in `f`.

        `min`/`max` intentionally shadow the builtins so callers get a
        natural keyword interface (min=..., max=...).
        """
        toposort = f.maker.env.toposort()
        matches = [node for node in toposort if node.op == op]
        assert (min <= len(matches) <= max), (toposort, matches,
                                              str(op), min, max)

    def assertFunctionContains0(self, f, op):
        """Assert `op` does not occur in `f` at all."""
        return self.assertFunctionContains(f, op, min=0, max=0)

    def assertFunctionContains1(self, f, op):
        """Assert `op` occurs exactly once in `f`."""
        return self.assertFunctionContains(f, op, min=1, max=1)

    def assertFunctionContainsN(self, f, op, N):
        """Assert `op` occurs exactly `N` times in `f`."""
        return self.assertFunctionContains(f, op, min=N, max=N)

    def SkipTest(self, msg='Skip this test'):
        # Delegates to the module-level SkipTest exception (nose's when
        # available, the local fallback class otherwise).
        raise SkipTest(msg)
# This object name should not start with Test.
# Otherwise nosetests will execute it!
class T_OpContractMixin(object):
    """Mixin spelling out the Op contract: __eq__, __hash__ and __str__.

    Subclasses must set `self.ops` to a list of instantiations of an Op
    class to test, and override `clone`. `other_op` must be an op which is
    different from every op in `self.ops`.
    """
    other_op = T.add

    def copy(self, x):
        return copy(x)

    def deepcopy(self, x):
        return deepcopy(x)

    def clone(self, op):
        raise NotImplementedError('return new instance like `op`')

    def test_eq(self):
        # Each op must equal itself and its copies, and differ from every
        # other op in the list and from other_op.
        for i, op_i in enumerate(self.ops):
            assert op_i == op_i
            assert op_i == self.copy(op_i)
            assert op_i == self.deepcopy(op_i)
            assert op_i == self.clone(op_i)
            assert op_i != self.other_op
            for j, op_j in enumerate(self.ops):
                if i == j:
                    continue
                assert op_i != op_j

    def test_hash(self):
        # Hash must be consistent with equality: stable across copies,
        # different for distinct ops.
        for i, op_i in enumerate(self.ops):
            h_i = hash(op_i)
            assert h_i == hash(op_i)
            assert h_i == hash(self.copy(op_i))
            assert h_i == hash(self.deepcopy(op_i))
            assert h_i == hash(self.clone(op_i))
            assert h_i != hash(self.other_op)
            for j, op_j in enumerate(self.ops):
                if i == j:
                    continue
                # BUG FIX: this previously asserted `op_i != hash(op_j)`,
                # comparing an Op instance to an int — vacuously true.
                # The intent (see the h_i checks above) is that distinct
                # ops hash differently.
                assert h_i != hash(op_j)

    def test_name(self):
        for op in self.ops:
            s = str(op)  # show that str works
            assert s  # names should not be empty
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论