提交 1d26096a authored 作者: Olivier Delalleau's avatar Olivier Delalleau

Merge pull request #531 from nouiz/crash_fix

Crash fix
...@@ -44,6 +44,9 @@ Internal changes ...@@ -44,6 +44,9 @@ Internal changes
Crash Fix Crash Fix
* Don't try to use blas library when told to don't use them(Frederic B.) * Don't try to use blas library when told to don't use them(Frederic B.)
* When importing theano on a computer without without gpu with the Theano
flags device or init_gpu_device to gpu* (Frederic B., Reported Luo Heng)
============= =============
Release Notes Release Notes
......
...@@ -101,6 +101,7 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'): ...@@ -101,6 +101,7 @@ if config.device.startswith('gpu') or config.init_gpu_device.startswith('gpu'):
# We can't test the driver during import of theano.sandbox.cuda as # We can't test the driver during import of theano.sandbox.cuda as
# this cause circular import dependency. So we also test it manually # this cause circular import dependency. So we also test it manually
# after the import # after the import
if theano.sandbox.cuda.cuda_available:
import theano.sandbox.cuda.tests.test_driver import theano.sandbox.cuda.tests.test_driver
theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1() theano.sandbox.cuda.tests.test_driver.test_nvidia_driver1()
......
from theano import Op, Type, Apply, Variable, Constant import os
from theano import tensor, scalar import StringIO
import StringIO, os
import cuda_ndarray.cuda_ndarray as cuda from theano import Apply
from theano import tensor
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
class GpuDot22(GpuOp): class GpuDot22(GpuOp):
""" """
Implement dot(2d, 2d) on the gpu. Implement dot(2d, 2d) on the gpu.
""" """
def __str__(self): def __str__(self):
return 'GpuDot22' return 'GpuDot22'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -25,10 +27,10 @@ class GpuDot22(GpuOp): ...@@ -25,10 +27,10 @@ class GpuDot22(GpuOp):
raise TypeError(y) raise TypeError(y)
otype = CudaNdarrayType( otype = CudaNdarrayType(
(x.type.broadcastable[0], y.type.broadcastable[1])) (x.type.broadcastable[0], y.type.broadcastable[1]))
return Apply(self, [x,y], [otype()]) return Apply(self, [x, y], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,1) return (1, 1)
def c_code(self, node, nodename, inputs, outputs, sub): def c_code(self, node, nodename, inputs, outputs, sub):
x, y = inputs x, y = inputs
...@@ -46,8 +48,10 @@ class GpuDot22(GpuOp): ...@@ -46,8 +48,10 @@ class GpuDot22(GpuOp):
%(fail)s; %(fail)s;
} }
if ((NULL == %(z)s) if ((NULL == %(z)s)
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1])) CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1]))
{ {
//if (%(z)s) Py_DECREF(%(z)s); //if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
...@@ -55,7 +59,8 @@ class GpuDot22(GpuOp): ...@@ -55,7 +59,8 @@ class GpuDot22(GpuOp):
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1]; dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
%(z)s = (CudaNdarray*)CudaNdarray_New(); %(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims)) if ((NULL == %(z)s) ||
CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
{ {
if (%(z)s) if (%(z)s)
{ {
...@@ -77,12 +82,14 @@ class GpuDot22(GpuOp): ...@@ -77,12 +82,14 @@ class GpuDot22(GpuOp):
""" % locals() """ % locals()
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
class GpuDot22Scalar(GpuOp): class GpuDot22Scalar(GpuOp):
""" """
Implement dot(2d, 2d) * scalar on the gpu. Implement dot(2d, 2d) * scalar on the gpu.
""" """
def __str__(self): def __str__(self):
return 'GpuDot22Scalar' return 'GpuDot22Scalar'
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) return type(self) == type(other)
...@@ -98,10 +105,10 @@ class GpuDot22Scalar(GpuOp): ...@@ -98,10 +105,10 @@ class GpuDot22Scalar(GpuOp):
raise TypeError(a) raise TypeError(a)
otype = CudaNdarrayType( otype = CudaNdarrayType(
(x.type.broadcastable[0], y.type.broadcastable[1])) (x.type.broadcastable[0], y.type.broadcastable[1]))
return Apply(self, [x,y,a], [otype()]) return Apply(self, [x, y, a], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,1) return (1, 1)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
x, y, a = inputs x, y, a = inputs
...@@ -124,9 +131,11 @@ class GpuDot22Scalar(GpuOp): ...@@ -124,9 +131,11 @@ class GpuDot22Scalar(GpuOp):
%(fail)s; %(fail)s;
} }
if ((NULL == %(z)s) if ((NULL == %(z)s) ||
|| (CudaNdarray_HOST_DIMS(%(z)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) (CudaNdarray_HOST_DIMS(%(z)s)[0] !=
|| (CudaNdarray_HOST_DIMS(%(z)s)[1] != CudaNdarray_HOST_DIMS(%(y)s)[1])) CudaNdarray_HOST_DIMS(%(x)s)[0]) ||
(CudaNdarray_HOST_DIMS(%(z)s)[1] !=
CudaNdarray_HOST_DIMS(%(y)s)[1]))
{ {
//if (%(z)s) Py_DECREF(%(z)s); //if (%(z)s) Py_DECREF(%(z)s);
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
...@@ -134,7 +143,8 @@ class GpuDot22Scalar(GpuOp): ...@@ -134,7 +143,8 @@ class GpuDot22Scalar(GpuOp):
dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0]; dims[0] = CudaNdarray_HOST_DIMS(%(x)s)[0];
dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1]; dims[1] = CudaNdarray_HOST_DIMS(%(y)s)[1];
%(z)s = (CudaNdarray*)CudaNdarray_New(); %(z)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(z)s) || CudaNdarray_alloc_contiguous(%(z)s, 2, dims)) if ((NULL == %(z)s) ||
CudaNdarray_alloc_contiguous(%(z)s, 2, dims))
{ {
if (%(z)s) if (%(z)s)
{ {
...@@ -156,13 +166,14 @@ class GpuDot22Scalar(GpuOp): ...@@ -156,13 +166,14 @@ class GpuDot22Scalar(GpuOp):
""" % locals() """ % locals()
gpu_dot22scalar = GpuDot22Scalar() gpu_dot22scalar = GpuDot22Scalar()
class GpuGemm(GpuOp): class GpuGemm(GpuOp):
""" """
implement the gemm on the gpu. implement the gemm on the gpu.
""" """
def __init__(self, inplace): def __init__(self, inplace):
self.__setstate__({'inplace':inplace}) self.__setstate__({'inplace': inplace})
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
...@@ -187,8 +198,8 @@ class GpuGemm(GpuOp): ...@@ -187,8 +198,8 @@ class GpuGemm(GpuOp):
return dict(inplace=self.inplace) return dict(inplace=self.inplace)
def make_node(self, z, a, x, y, b): def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemm is assumed to already # the more complicated error checking performed by tensor.gemm
# have been done # is assumed to already have been done
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -270,13 +281,14 @@ class GpuGemm(GpuOp): ...@@ -270,13 +281,14 @@ class GpuGemm(GpuOp):
gpu_gemm_no_inplace = GpuGemm(inplace=False) gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True) gpu_gemm_inplace = GpuGemm(inplace=True)
class GpuGemv(GpuOp): class GpuGemv(GpuOp):
""" """
implement gemv on the gpu. implement gemv on the gpu.
""" """
def __init__(self, inplace): def __init__(self, inplace):
self.__setstate__({'inplace':inplace}) self.__setstate__({'inplace': inplace})
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
...@@ -301,8 +313,8 @@ class GpuGemv(GpuOp): ...@@ -301,8 +313,8 @@ class GpuGemv(GpuOp):
return dict(inplace=self.inplace) return dict(inplace=self.inplace)
def make_node(self, z, a, x, y, b): def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemv is assumed to already # the more complicated error checking performed by tensor.gemv
# have been done # is assumed to already have been done
return Apply(self, [z, a, x, y, b], [z.type()]) return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -333,7 +345,8 @@ class GpuGemv(GpuOp): ...@@ -333,7 +345,8 @@ class GpuGemv(GpuOp):
Py_INCREF(%(z_out)s); Py_INCREF(%(z_out)s);
} }
else if (%(z_out)s else if (%(z_out)s
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] ==
CudaNdarray_HOST_DIMS(%(z_in)s)[0])
&& ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0) && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
|| ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0) || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
&& (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1)))) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
...@@ -355,7 +368,8 @@ class GpuGemv(GpuOp): ...@@ -355,7 +368,8 @@ class GpuGemv(GpuOp):
} }
} }
if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s)) if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s,
%(name)s_beta, %(z_out)s))
{ {
%(fail)s; %(fail)s;
} }
...@@ -364,13 +378,14 @@ class GpuGemv(GpuOp): ...@@ -364,13 +378,14 @@ class GpuGemv(GpuOp):
gpu_gemv_no_inplace = GpuGemv(inplace=False) gpu_gemv_no_inplace = GpuGemv(inplace=False)
gpu_gemv_inplace = GpuGemv(inplace=True) gpu_gemv_inplace = GpuGemv(inplace=True)
class GpuGer(GpuOp): class GpuGer(GpuOp):
""" """
implement ger on the gpu. implement ger on the gpu.
""" """
def __init__(self, inplace): def __init__(self, inplace):
self.__setstate__({'inplace':inplace}) self.__setstate__({'inplace': inplace})
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
...@@ -468,6 +483,7 @@ class GpuGer(GpuOp): ...@@ -468,6 +483,7 @@ class GpuGer(GpuOp):
gpu_ger_no_inplace = GpuGer(inplace=False) gpu_ger_no_inplace = GpuGer(inplace=False)
gpu_ger_inplace = GpuGer(inplace=True) gpu_ger_inplace = GpuGer(inplace=True)
class GpuOuter(GpuOp): class GpuOuter(GpuOp):
""" Implement outer on the gpu.""" """ Implement outer on the gpu."""
def make_node(self, x, y): def make_node(self, x, y):
...@@ -519,8 +535,10 @@ class GpuOuter(GpuOp): ...@@ -519,8 +535,10 @@ class GpuOuter(GpuOp):
Py_INCREF(%(name)sy); Py_INCREF(%(name)sy);
} }
if (!(%(A)s && if (!(%(A)s &&
CudaNdarray_HOST_DIMS(%(A)s)[0] == CudaNdarray_HOST_DIMS(%(x)s)[0] && CudaNdarray_HOST_DIMS(%(A)s)[0] ==
CudaNdarray_HOST_DIMS(%(A)s)[1] == CudaNdarray_HOST_DIMS(%(y)s)[0] && CudaNdarray_HOST_DIMS(%(x)s)[0] &&
CudaNdarray_HOST_DIMS(%(A)s)[1] ==
CudaNdarray_HOST_DIMS(%(y)s)[0] &&
CudaNdarray_is_c_contiguous(%(A)s))) { CudaNdarray_is_c_contiguous(%(A)s))) {
Py_XDECREF(%(A)s); Py_XDECREF(%(A)s);
int dims[2]; int dims[2];
...@@ -541,7 +559,9 @@ class GpuOuter(GpuOp): ...@@ -541,7 +559,9 @@ class GpuOuter(GpuOp):
CudaNdarray_HOST_DIMS(%(A)s)[1]); CudaNdarray_HOST_DIMS(%(A)s)[1]);
if (cudaSuccess != cudaMemset(%(A)s->devdata, 0, total_size)) if (cudaSuccess != cudaMemset(%(A)s->devdata, 0, total_size))
{ {
PyErr_Format(PyExc_MemoryError, "GpuOuter: Error memsetting %%d bytes of device memory.", total_size); PyErr_Format(PyExc_MemoryError,
"GpuOuter: Error memsetting %%d bytes of device memory.",
total_size);
Py_DECREF(%(name)sy); Py_DECREF(%(name)sy);
Py_DECREF(%(name)sx); Py_DECREF(%(name)sx);
%(fail)s; %(fail)s;
...@@ -554,10 +574,11 @@ class GpuOuter(GpuOp): ...@@ -554,10 +574,11 @@ class GpuOuter(GpuOp):
if (%(name)sres) { if (%(name)sres) {
%(fail)s; %(fail)s;
} }
"""%dict(x=x,y=y,A=A,fail=fail,name=name) """ % dict(x=x, y=y, A=A, fail=fail, name=name)
gpu_outer = GpuOuter() gpu_outer = GpuOuter()
## ##
# Not really a BLAS operation, but whatever. # Not really a BLAS operation, but whatever.
# #
...@@ -574,7 +595,7 @@ class GpuConv(GpuOp): ...@@ -574,7 +595,7 @@ class GpuConv(GpuOp):
raise ValueError(mode) raise ValueError(mode)
def __init__(self, border_mode, def __init__(self, border_mode,
subsample=(1,1), subsample=(1, 1),
logical_img_hw=None, logical_img_hw=None,
logical_kern_hw=None, logical_kern_hw=None,
logical_kern_align_top=True, logical_kern_align_top=True,
...@@ -591,30 +612,32 @@ class GpuConv(GpuOp): ...@@ -591,30 +612,32 @@ class GpuConv(GpuOp):
the execution of the convolution. Mostly used for the execution of the convolution. Mostly used for
optimization or debugging. optimization or debugging.
:param kshp: The size of the kernel. If provided, can genera :param kshp: The size of the kernel. If provided, can genera
faster code. If the GpuConv op is automatically inserted, faster code. If the GpuConv op is automatically
inserted,
we take its value automatically from the Conv op. we take its value automatically from the Conv op.
:param imshp: The size of the image. Not used for code generation but :param imshp: The size of the image. Not used for code generation but
allow to select an experimental new version in another repo. allow to select an experimental new version in another
repo.
""" """
self.border_mode = border_mode self.border_mode = border_mode
self.subsample = subsample self.subsample = subsample
if logical_img_hw is not None: if logical_img_hw is not None:
h,w = logical_img_hw h, w = logical_img_hw
#TODO: reconsider this... since shapes are not given in constructor, #TODO: reconsider this... since shapes are not given in
# maybe a multiplier + offset is a more appropriate way of passing this logical # constructor, maybe a multiplier + offset is a more
# grid # appropriate way of passing this logical grid
logical_img_hw = tuple(logical_img_hw) logical_img_hw = tuple(logical_img_hw)
self.logical_img_hw = logical_img_hw self.logical_img_hw = logical_img_hw
if logical_kern_hw is not None: if logical_kern_hw is not None:
h,w = logical_kern_hw h, w = logical_kern_hw
#TODO: reconsider this... since shapes are not given in constructor, #TODO: reconsider this... since shapes are not given in
# maybe a multiplier + offset is a more appropriate way of passing this logical # constructor, maybe a multiplier + offset is a more
# grid # appropriate way of passing this logical grid
logical_kern_hw = tuple(logical_kern_hw) logical_kern_hw = tuple(logical_kern_hw)
self.logical_kern_hw = logical_kern_hw self.logical_kern_hw = logical_kern_hw
self.logical_kern_align_top = logical_kern_align_top self.logical_kern_align_top = logical_kern_align_top
self.version=version self.version = version
self.verbose=verbose self.verbose = verbose
self.kshp = kshp self.kshp = kshp
self.imshp = imshp self.imshp = imshp
...@@ -632,11 +655,12 @@ class GpuConv(GpuOp): ...@@ -632,11 +655,12 @@ class GpuConv(GpuOp):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
if not hasattr(self,"imshp"): if not hasattr(self, "imshp"):
self.imshp = None self.imshp = None
def __hash__(self): def __hash__(self):
# don't use hash(self.version) as hash(-1)==-2 and hash(-2)==-2 in python! # don't use hash(self.version) as hash(-1)==-2 and
# hash(-2)==-2 in python!
return hash(type(self)) \ return hash(type(self)) \
^ hash(self.border_mode) \ ^ hash(self.border_mode) \
^ hash(self.subsample) \ ^ hash(self.subsample) \
...@@ -649,7 +673,8 @@ class GpuConv(GpuOp): ...@@ -649,7 +673,8 @@ class GpuConv(GpuOp):
^ hash(self.imshp) ^ hash(self.imshp)
def __str__(self): def __str__(self):
return '%s{%s, %s, %s, %s, %s, %s, %s}' %(self.__class__.__name__, return '%s{%s, %s, %s, %s, %s, %s, %s}' % (
self.__class__.__name__,
self.border_mode, self.border_mode,
str(self.subsample), str(self.subsample),
str(self.logical_img_hw), str(self.logical_img_hw),
...@@ -664,26 +689,30 @@ class GpuConv(GpuOp): ...@@ -664,26 +689,30 @@ class GpuConv(GpuOp):
if kern.type.ndim != 4: if kern.type.ndim != 4:
raise TypeError('kern must be 4D tensor') raise TypeError('kern must be 4D tensor')
broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0], False, False] broadcastable = [img.type.broadcastable[0], kern.type.broadcastable[0],
False, False]
return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()]) return Apply(self, [img, kern], [CudaNdarrayType(broadcastable)()])
def c_compile_args(self): def c_compile_args(self):
nb = 0 nb = 0
if self.kshp is not None: if self.kshp is not None:
nb = self.kshp[1] nb = self.kshp[1]
return ['-DTHEANO_KERN_WID='+str(nb)]#,'-g','-G'] return ['-DTHEANO_KERN_WID=' + str(nb)] # ,'-g','-G']
def c_headers(self): def c_headers(self):
return ['cuda_ndarray.cuh','<stdio.h>'] return ['cuda_ndarray.cuh', '<stdio.h>']
def c_code_cache_version(self): def c_code_cache_version(self):
return (0, 17) # raise this whenever modifying any of the support_code_files # raise this whenever modifying any of the support_code_files
return (0, 17)
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# REMEMBER TO RAISE c_code_cache_version when changing any of these files # REMEMBER TO RAISE c_code_cache_version when changing any of
return open(os.path.join(os.path.split(__file__)[0],'conv_kernel.cu')).read()+\ # these files
open(os.path.join(os.path.split(__file__)[0],'conv_full_kernel.cu')).read()+\ files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
open(os.path.join(os.path.split(__file__)[0],'conv.cu')).read() codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files]
return reduce(str.__add__, codes)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
img, kern = inp img, kern = inp
...@@ -716,15 +745,18 @@ class GpuConv(GpuOp): ...@@ -716,15 +745,18 @@ class GpuConv(GpuOp):
} }
else else
{ {
PyErr_SetString(PyExc_ValueError, "mode must be one of 'full' or 'valid'"); PyErr_SetString(PyExc_ValueError,
"mode must be one of 'full' or 'valid'");
return NULL; return NULL;
} }
CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s, %(out)s, CudaNdarray * out2 = (CudaNdarray *)CudaNdarray_Conv(%(img)s, %(kern)s,
mode, dx, dy, version, verbose); %(out)s, mode,
dx, dy,
version, verbose);
Py_XDECREF(%(out)s); Py_XDECREF(%(out)s);
%(out)s = out2; %(out)s = out2;
"""%sub """ % sub
class GpuDownsampleFactorMax(GpuOp): class GpuDownsampleFactorMax(GpuOp):
...@@ -736,13 +768,17 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -736,13 +768,17 @@ class GpuDownsampleFactorMax(GpuOp):
self.ignore_border = ignore_border self.ignore_border = ignore_border
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border return (type(self) == type(other) and
self.ds == other.ds and
self.ignore_border == other.ignore_border)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border) return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
def __str__(self): def __str__(self):
return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border) return '%s{%s,%s}' % (self.__class__.__name__,
self.ds,
self.ignore_border)
def make_node(self, x): def make_node(self, x):
if not isinstance(x.type, CudaNdarrayType): if not isinstance(x.type, CudaNdarrayType):
...@@ -750,10 +786,12 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -750,10 +786,12 @@ class GpuDownsampleFactorMax(GpuOp):
if not x.type.ndim == 4: if not x.type.ndim == 4:
raise TypeError() raise TypeError()
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
#def perform(self, node, input_storage, output_storage): #def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') #raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return (3) return (3)
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
x, = inp x, = inp
z, = out z, = out
...@@ -779,7 +817,10 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -779,7 +817,10 @@ class GpuDownsampleFactorMax(GpuOp):
dims[3] += (xdim3%%(%(ds1)s)?1:0); dims[3] += (xdim3%%(%(ds1)s)?1:0);
} }
if(dims[3]>512){ if(dims[3]>512){
PyErr_Format(PyExc_ValueError, "GpuDownsampleFactorMax: last dimention size of %%d is bigger then 512. This case is not implemented.", dims[3]); PyErr_Format(PyExc_ValueError,
"GpuDownsampleFactorMax: last dimention size of %%d"
" is bigger then 512. This case is not implemented.",
dims[3]);
%(fail)s; %(fail)s;
} }
...@@ -796,17 +837,19 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -796,17 +837,19 @@ class GpuDownsampleFactorMax(GpuOp):
{ {
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = NULL; %(z)s = NULL;
PyErr_SetString(PyExc_ValueError, "Was not able to allocate output!"); PyErr_SetString(PyExc_ValueError,
"Was not able to allocate output!");
%(fail)s; %(fail)s;
} }
} }
{ {
dim3 grid(dims[0] * dims[1], dims[2]); dim3 grid(dims[0] * dims[1], dims[2]);
//dim3 block(std::min(dims[3], 512)); //TODO: implement this by supporting more //dim3 block(std::min(dims[3], 512));
//outputs than threads //TODO: implement this by supporting more outputs than threads
dim3 block(dims[3]); dim3 block(dims[3]);
if ((grid.x*grid.y) && dims[3]) if ((grid.x*grid.y) && dims[3])
kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block, xdim3*sizeof(float)>>>( kMaxPool_%(nodename)s<%(ds0)s, %(ds1)s> <<<grid, block,
xdim3*sizeof(float)>>>(
dims[0], dims[1], dims[2], dims[3], xdim2, xdim3, dims[0], dims[1], dims[2], dims[3], xdim2, xdim3,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_DEV_DATA(%(x)s),
CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[0],
...@@ -818,7 +861,9 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -818,7 +861,9 @@ class GpuDownsampleFactorMax(GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i;"
" block: %%i x %%i x %%i)\\n",
"kMaxPool_%(nodename)s", "kMaxPool_%(nodename)s",
cudaGetErrorString(err), cudaGetErrorString(err),
grid.x, grid.x,
...@@ -847,7 +892,9 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -847,7 +892,9 @@ class GpuDownsampleFactorMax(GpuOp):
extern __shared__ float xbuf[]; //size [xD3] extern __shared__ float xbuf[]; //size [xD3]
for (int r2 = 0; (r2 < pf2) && (%(ignore_border)s || (r2 + i2*pf2 < xD2)); ++r2) for (int r2 = 0;
(r2 < pf2) && (%(ignore_border)s || (r2 + i2*pf2 < xD2));
++r2)
{ {
__syncthreads(); __syncthreads();
// load the current row of the image into shared memory // load the current row of the image into shared memory
...@@ -860,7 +907,9 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -860,7 +907,9 @@ class GpuDownsampleFactorMax(GpuOp):
// initialize our max if this is the first row we're loading // initialize our max if this is the first row we're loading
cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max; cur_max = (r2 == 0) ? xbuf[threadIdx.x*pf3] : cur_max;
// do a mini-reduction over the pf3 relevant elements in the current row // do a mini-reduction over the pf3 relevant elements
// in the current row
if (%(ignore_border)s) if (%(ignore_border)s)
{ {
for (int k = 0; k < pf3; ++k) for (int k = 0; k < pf3; ++k)
...@@ -887,6 +936,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -887,6 +936,7 @@ class GpuDownsampleFactorMax(GpuOp):
} }
""" % locals() """ % locals()
class GpuDownsampleFactorMaxGrad(GpuOp): class GpuDownsampleFactorMaxGrad(GpuOp):
""" """
Implement the grad of downsample with max on the gpu. Implement the grad of downsample with max on the gpu.
...@@ -896,16 +946,21 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -896,16 +946,21 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
self.ignore_border = ignore_border self.ignore_border = ignore_border
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) and self.ds == other.ds and self.ignore_border == other.ignore_border return (type(self) == type(other) and
self.ds == other.ds and
self.ignore_border == other.ignore_border)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border) return hash(type(self)) ^ hash(self.ds) ^ hash(self.ignore_border)
def __str__(self): def __str__(self):
return '%s{%s,%s}' % (self.__class__.__name__, self.ds, self.ignore_border) return '%s{%s,%s}' % (self.__class__.__name__,
self.ds,
self.ignore_border)
def make_node(self, x, z, gz): def make_node(self, x, z, gz):
return Apply(self, [x, z, gz], [x.type()]) return Apply(self, [x, z, gz], [x.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
#return () #return ()
return (5,) return (5,)
...@@ -925,15 +980,20 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -925,15 +980,20 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
%(fail)s; %(fail)s;
} }
if ((NULL == %(gx)s) if ((NULL == %(gx)s)
|| (CudaNdarray_HOST_DIMS(%(gx)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(gx)s)[0] !=
|| (CudaNdarray_HOST_DIMS(%(gx)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1]) CudaNdarray_HOST_DIMS(%(x)s)[0])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[2] != CudaNdarray_HOST_DIMS(%(x)s)[2]) || (CudaNdarray_HOST_DIMS(%(gx)s)[1] !=
|| (CudaNdarray_HOST_DIMS(%(gx)s)[3] != CudaNdarray_HOST_DIMS(%(x)s)[3])) CudaNdarray_HOST_DIMS(%(x)s)[1])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[2] !=
CudaNdarray_HOST_DIMS(%(x)s)[2])
|| (CudaNdarray_HOST_DIMS(%(gx)s)[3] !=
CudaNdarray_HOST_DIMS(%(x)s)[3]))
{ {
Py_XDECREF(%(gx)s); Py_XDECREF(%(gx)s);
%(gx)s = (CudaNdarray*)CudaNdarray_New(); %(gx)s = (CudaNdarray*)CudaNdarray_New();
if ((NULL == %(gx)s) if ((NULL == %(gx)s)
|| CudaNdarray_alloc_contiguous(%(gx)s, 4, CudaNdarray_HOST_DIMS(%(x)s))) || CudaNdarray_alloc_contiguous(%(gx)s, 4,
CudaNdarray_HOST_DIMS(%(x)s)))
{ {
Py_XDECREF(%(gx)s); Py_XDECREF(%(gx)s);
%(gx)s = NULL; %(gx)s = NULL;
...@@ -942,7 +1002,8 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -942,7 +1002,8 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
} }
{ {
//TODO: supporting more output columns than threads //TODO: supporting more output columns than threads
// make sure we cover every x row when ignore border isset and there's a border present to be ignored // make sure we cover every x row when ignore border isset and
// there's a border present to be ignored
int needs_extra_z_col = %(ignore_border)s && (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s); int needs_extra_z_col = %(ignore_border)s && (CudaNdarray_HOST_DIMS(%(x)s)[2] %% %(ds0)s);
dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0],CudaNdarray_HOST_DIMS(%(z)s)[2] + (needs_extra_z_col ? 1 : 0)); dim3 grid(CudaNdarray_HOST_DIMS(%(z)s)[0],CudaNdarray_HOST_DIMS(%(z)s)[2] + (needs_extra_z_col ? 1 : 0));
dim3 block(std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], 512)); dim3 block(std::min(CudaNdarray_HOST_DIMS(%(x)s)[3], 512));
...@@ -974,7 +1035,8 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -974,7 +1035,8 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if( cudaSuccess != err) if( cudaSuccess != err)
{ {
PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n", PyErr_Format(PyExc_RuntimeError,
"Cuda error: %%s: %%s. (grid: %%i x %%i; block: %%i x %%i x %%i)\\n",
"kDownsampleMaxGrad_%(nodename)s", "kDownsampleMaxGrad_%(nodename)s",
cudaGetErrorString(err), cudaGetErrorString(err),
grid.x, grid.x,
...@@ -988,16 +1050,19 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -988,16 +1050,19 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
# This code considers every position in the output z, andthen computes the gradient for the # This code considers every position in the output z, andthen
# input pixels that were downsampled to that z-position. It does so by running along every # computes the gradient for the input pixels that were
# z row (sometimes plus one, to make sure every gx row gets totally filled), and by # downsampled to that z-position. It does so by running along
# running along every x col. This code is not sensitive to the ignore_border flag along # every z row (sometimes plus one, to make sure every gx row
# the row dimension (since it runs for every position in the output z), but it is sensitive # gets totally filled), and by running along every x col. This
# along the col dimension. # code is not sensitive to the ignore_border flag along the
# row dimension (since it runs for every position in the
# output z), but it is sensitive along the col dimension.
ignore_border = int(self.ignore_border) ignore_border = int(self.ignore_border)
return """ return """
template<int ds0, int ds1> // ds0 is the downsampling factor in rows, ds1 in columns // ds0 is the downsampling factor in rows, ds1 in columns
template<int ds0, int ds1>
__global__ void kDownsampleMaxGrad_%(nodename)s( __global__ void kDownsampleMaxGrad_%(nodename)s(
int D0, int D1, int D2, int D3, int xD2, int xD3, int D0, int D1, int D2, int D3, int xD2, int xD3,
const float * x, int xS0, int xS1, int xS2, int xS3, const float * x, int xS0, int xS1, int xS2, int xS3,
...@@ -1016,18 +1081,24 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1016,18 +1081,24 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
float cur_max, cur_x, my_z, my_gz; float cur_max, cur_x, my_z, my_gz;
int i0 = blockIdx.x; // image row int i0 = blockIdx.x; // image row
int i1 = 0; // image col int i1 = 0; // image col
int i2 = blockIdx.y; // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2 (as needed to cover all x rows) // row wrt z and/or gz, ranges from 0 to D2 - 1 OR D2
// (as needed to cover all x rows)
int i2 = blockIdx.y;
int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1 int x_col = threadIdx.x; // col wrt x, ranges from 0 to xD3 - 1
int z_col = x_col/ds1; // z_col corresponding to this x_col int z_col = x_col/ds1; // z_col corresponding to this x_col
//TODO: raise occupancy. Use threadIdx.y to run several iterations of this i1 loop //TODO: raise occupancy. Use threadIdx.y to run several
//in parallel // iterations of this i1 loop in parallel
for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x) for (i1 = 0; i1 < D1; ++i1) // loop over images (same for z and x)
{ {
for(int col_iter = 0; col_iter * blockDim.x <= xD3 ; col_iter++){ for(int col_iter = 0;
//The if inside is to don't do the division if we need only 1 col_iter col_iter * blockDim.x <= xD3 ; col_iter++){
//The if inside is to don't do the division if we
// need only 1 col_iter
if(blockDim.x != xD3) if(blockDim.x != xD3)
{ {
x_col = threadIdx.x + col_iter * blockDim.x; x_col = threadIdx.x + col_iter * blockDim.x;
...@@ -1036,32 +1107,42 @@ class GpuDownsampleFactorMaxGrad(GpuOp): ...@@ -1036,32 +1107,42 @@ class GpuDownsampleFactorMaxGrad(GpuOp):
if (%(ignore_border)s && x_col >= ds1 * D3) if (%(ignore_border)s && x_col >= ds1 * D3)
{ {
// This happens only if x_col was ignored (via ignore_border) // This happens only if x_col was ignored
// TODO: if ignore_border is False, this is impossible and we don't even // (via ignore_border)
// need to generate this code. // TODO: if ignore_border is False, this is impossible
// and we don't even need to generate this code.
my_gz = 0.0f; my_gz = 0.0f;
//any fp number suffices for my_z, so we don't even need to set it to
//anything in particular. //any fp number suffices for my_z, so we don't even
//need to set it to anything in particular.
} }
else else
{ {
// this is effectively: // this is effectively:
// my_gz = gz[image_row][image_col][z_row][z_col] // my_gz = gz[image_row][image_col][z_row][z_col]
// my_z = z[image_row][image_col][z_row][z_col] // my_z = z[image_row][image_col][z_row][z_col]
my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 + z_col*gzS3]; my_gz = gz[i0 * gzS0 + i1 * gzS1 + i2 * gzS2 +
my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 + z_col* zS3]; z_col*gzS3];
my_z = z[i0 * zS0 + i1 * zS1 + i2 * zS2 +
z_col* zS3];
} }
if(x_col<xD3){ if(x_col<xD3){
for (int x_row = i2*ds0; (x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row) for (int x_row = i2*ds0;
(x_row < i2*ds0+ds0) && (x_row < xD2); ++x_row)
{ {
// this is effectively: // this is effectively:
// gx[image_row][image_col][x_row][x_col] // gx[image_row][image_col][x_row][x_col]
// = (my_z == x[image_row][image_col][x_row][x_col]) ? my_gz : 0.0f; // = (my_z == x[image_row][image_col][
gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col] // x_row][x_col]) ? my_gz : 0.0f;
= (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 + x_col*xS3]) ? my_gz : 0.0f; gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
} x_row*xD3 + x_col]
//gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 + x_row*xD3 + x_col] = -999; = (my_z == x[i0*xS0 + i1*xS1 + x_row*xS2 +
x_col*xS3]) ? my_gz : 0.0f;
}
//gx[i0 * D1*xD2*xD3 + i1*xD2*xD3 +
x_row*xD3 + x_col] = -999;
} }
} }
......
...@@ -39,12 +39,12 @@ Dot22Scalar is a GEMM where b=0 and Z is allocated every time. ...@@ -39,12 +39,12 @@ Dot22Scalar is a GEMM where b=0 and Z is allocated every time.
Gemm is a GEMM in all its generality. Gemm is a GEMM in all its generality.
In the future we can refactor the GemmRelated, Gemm, Dot22 and In the future we can refactor the GemmRelated, Gemm, Dot22 and
Dot22Scalar Ops into a single Op. That new Op (Gemm2) is basically a normal Gemm, but Dot22Scalar Ops into a single Op. That new Op (Gemm2) is basically a
with an additional configuration variable that says to ignore the input Z. normal Gemm, but with an additional configuration variable that says
Setting that configuration variable to True would make Gemm2 equivalent to the to ignore the input Z. Setting that configuration variable to True
current Dot22 and Dot22Scalar. This would make the file a lot easier to read, would make Gemm2 equivalent to the current Dot22 and Dot22Scalar.
and save a few hundred lines of library, to say nothing of testing and This would make the file a lot easier to read, and save a few hundred
documentation. lines of library, to say nothing of testing and documentation.
GEMV: Gemv GEMV: Gemv
...@@ -109,21 +109,24 @@ This is complicated, done in GemmOptimizer. ...@@ -109,21 +109,24 @@ This is complicated, done in GemmOptimizer.
Identify Dot22Scalar from Dot22 Identify Dot22Scalar from Dot22
------------------------------- -------------------------------
Dot22 Ops that remain after the GemmOptimizer is done have not qualified as GEMM Dot22 Ops that remain after the GemmOptimizer is done have not
Ops. Still they might be scaled by a factor, in which case we use Dot22Scalar qualified as GEMM Ops. Still they might be scaled by a factor, in
which is like Gemm, but without the b and the Z. In the future it would be good which case we use Dot22Scalar which is like Gemm, but without the b
to merge this into the GemmOptimizer. and the Z. In the future it would be good to merge this into the
GemmOptimizer.
Specialize Gemm to Gemv Specialize Gemm to Gemv
----------------------- -----------------------
If arguments to GEMM are dimshuffled vectors, then we can use GEMV instead. This If arguments to GEMM are dimshuffled vectors, then we can use GEMV
optimization is `local_gemm_to_gemv`. instead. This optimization is `local_gemm_to_gemv`.
""" """
import copy
import logging, copy, os, sys import logging
import os
import sys
import numpy import numpy
import numpy.distutils import numpy.distutils
...@@ -137,7 +140,7 @@ from theano.compile.mode import optdb ...@@ -137,7 +140,7 @@ from theano.compile.mode import optdb
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
import theano.scalar import theano.scalar
import basic as T import basic as T
from theano.tensor.blas_headers import blas_header_text #, cblas_header_text from theano.tensor.blas_headers import blas_header_text
from theano.tensor.opt import local_dimshuffle_lift from theano.tensor.opt import local_dimshuffle_lift
_logger = logging.getLogger('theano.tensor.blas') _logger = logging.getLogger('theano.tensor.blas')
...@@ -146,16 +149,17 @@ try: ...@@ -146,16 +149,17 @@ try:
import scipy.linalg.blas import scipy.linalg.blas
_have_fblas = True _have_fblas = True
_blas_gemv_fns = { _blas_gemv_fns = {
numpy.dtype('float32'):scipy.linalg.blas.fblas.sgemv, numpy.dtype('float32'): scipy.linalg.blas.fblas.sgemv,
numpy.dtype('float64'):scipy.linalg.blas.fblas.dgemv, numpy.dtype('float64'): scipy.linalg.blas.fblas.dgemv,
numpy.dtype('complex64'):scipy.linalg.blas.fblas.cgemv, numpy.dtype('complex64'): scipy.linalg.blas.fblas.cgemv,
numpy.dtype('complex128'):scipy.linalg.blas.fblas.zgemv, numpy.dtype('complex128'): scipy.linalg.blas.fblas.zgemv,
} }
except ImportError, e: except ImportError, e:
_have_fblas = False _have_fblas = False
_logger.warning('Failed to import scipy.linalg.blas.fblas. ' _logger.warning('Failed to import scipy.linalg.blas.fblas. '
'Falling back on slower implementations (%s)', str(e)) 'Falling back on slower implementations (%s)', str(e))
class Gemv(Op): class Gemv(Op):
""" """
expression is beta * y + alpha * A x expression is beta * y + alpha * A x
...@@ -166,12 +170,12 @@ class Gemv(Op): ...@@ -166,12 +170,12 @@ class Gemv(Op):
output is a vector that can be inplace on y output is a vector that can be inplace on y
""" """
def __init__(self, inplace): def __init__(self, inplace):
self.inplace=inplace self.inplace = inplace
if inplace: if inplace:
self.destroy_map={0:[0]} self.destroy_map = {0: [0]}
def __eq__(self, other): def __eq__(self, other):
return type(self)==type(other) and self.inplace == other.inplace return type(self) == type(other) and self.inplace == other.inplace
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
...@@ -189,35 +193,44 @@ class Gemv(Op): ...@@ -189,35 +193,44 @@ class Gemv(Op):
alpha = T.as_tensor_variable(alpha) alpha = T.as_tensor_variable(alpha)
beta = T.as_tensor_variable(beta) beta = T.as_tensor_variable(beta)
if y.dtype != A.dtype or y.dtype != x.dtype: if y.dtype != A.dtype or y.dtype != x.dtype:
raise TypeError('Gemv requires matching dtypes', (y.dtype, A.dtype, x.dtype)) raise TypeError('Gemv requires matching dtypes',
if A.ndim != 2: raise TypeError('gemv requires matrix for A', A.type) (y.dtype, A.dtype, x.dtype))
if x.ndim != 1: raise TypeError('gemv requires vector for x', x.type) if A.ndim != 2:
if y.ndim != 1: raise TypeError('gemv requires vector for y', y.type) raise TypeError('gemv requires matrix for A', A.type)
if x.ndim != 1:
raise TypeError('gemv requires vector for x', x.type)
if y.ndim != 1:
raise TypeError('gemv requires vector for y', y.type)
if y.broadcastable[0] != A.broadcastable[0]: if y.broadcastable[0] != A.broadcastable[0]:
raise TypeError('broadcastable mismatch between y and A', (y.type, A.type)) raise TypeError('broadcastable mismatch between y and A',
# The following is not grounds for error (y.type, A.type))
# because as long as sizes are 1 at time of perform() there is no problem # The following is not grounds for error because as long as
# sizes are 1 at time of perform() there is no problem
#if x.broadcastable[0] != A.broadcastable[1]: #if x.broadcastable[0] != A.broadcastable[1]:
#raise TypeError('broadcastable mismatch between x and A', (x.type, A.type)) #raise TypeError('broadcastable mismatch between x and A',
#(x.type, A.type))
return Apply(self, [y, alpha, A, x, beta], [y.type()]) return Apply(self, [y, alpha, A, x, beta], [y.type()])
def perform(self, node, inputs, out_storage): def perform(self, node, inputs, out_storage):
y, alpha, A, x, beta = inputs y, alpha, A, x, beta = inputs
if _have_fblas and y.shape[0]!=0 and x.shape[0]!=0: if _have_fblas and y.shape[0] != 0 and x.shape[0] != 0:
gemv = _blas_gemv_fns[y.dtype] gemv = _blas_gemv_fns[y.dtype]
if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]): if (A.shape[0] != y.shape[0] or A.shape[1] != x.shape[0]):
raise ValueError('Incompatible shapes for gemv ' raise ValueError('Incompatible shapes for gemv '
'(beta * y + alpha * dot(A, x)). y: %s, A: %s, x: %s ' '(beta * y + alpha * dot(A, x)). y: %s, A: %s, x: %s '
% (y.shape, A.shape, x.shape))# % (y.shape, A.shape, x.shape))
#Here I suppose that A is in c order. If we don't make it explicitly #Here I suppose that A is in c order. If we don't make it
# as fortran order, scipy 0.7.2 seam to create a copy in fortran # explicitly as fortran order, scipy 0.7.2 seam to create
# order instead of just reshaping it and using the trans flag. # a copy in fortran order instead of just reshaping it
# and using the trans flag.
#If A is already in fortran order, make it in c order and using the #If A is already in fortran order, make it in c order and using the
# trans flag don't seam to cause slowdown. # trans flag don't seam to cause slowdown.
#out_storage[0][0] = gemv(alpha, A, x, beta, y, overwrite_y=self.inplace) #out_storage[0][0] = gemv(alpha, A, x, beta, y,
out_storage[0][0] = gemv(alpha, A.T, x, beta, y, overwrite_y=self.inplace, trans=True) # overwrite_y=self.inplace)
out_storage[0][0] = gemv(alpha, A.T, x, beta, y,
overwrite_y=self.inplace, trans=True)
else: else:
out = numpy.dot(A, x) out = numpy.dot(A, x)
if alpha != 1: if alpha != 1:
...@@ -231,6 +244,7 @@ class Gemv(Op): ...@@ -231,6 +244,7 @@ class Gemv(Op):
gemv_no_inplace = Gemv(inplace=False) gemv_no_inplace = Gemv(inplace=False)
gemv_inplace = Gemv(inplace=True) gemv_inplace = Gemv(inplace=True)
class Ger(Op): class Ger(Op):
""" """
BLAS defines general rank-1 update GER as A <- A + alpha x y' BLAS defines general rank-1 update GER as A <- A + alpha x y'
...@@ -245,12 +259,13 @@ class Ger(Op): ...@@ -245,12 +259,13 @@ class Ger(Op):
and override the make_thunk() method to use Scipy and C respectively. and override the make_thunk() method to use Scipy and C respectively.
""" """
def __init__(self, destructive): def __init__(self, destructive):
self.destructive=destructive self.destructive = destructive
if destructive: if destructive:
self.destroy_map={0:[0]} self.destroy_map = {0: [0]}
def __eq__(self, other): def __eq__(self, other):
return type(self)==type(other) and self.destructive == other.destructive return (type(self) == type(other) and
self.destructive == other.destructive)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.destructive) return hash(type(self)) ^ hash(self.destructive)
...@@ -299,30 +314,36 @@ class Ger(Op): ...@@ -299,30 +314,36 @@ class Ger(Op):
ger = Ger(destructive=False) ger = Ger(destructive=False)
ger_destructive = Ger(destructive=True) ger_destructive = Ger(destructive=True)
def default_blas_ldflags(): def default_blas_ldflags():
try: try:
# If we are in a EPD installation, mkl is available # If we are in a EPD installation, mkl is available
blas_info = numpy.distutils.__config__.blas_opt_info
if "EPD" in sys.version: if "EPD" in sys.version:
if sys.platform == 'win32': if sys.platform == 'win32':
return ' '.join( return ' '.join(
['-L%s' % os.path.join(sys.prefix, "Scripts")] + ['-L%s' % os.path.join(sys.prefix, "Scripts")] +
# Why on Windows, the library used are not the # Why on Windows, the library used are not the
# same as what is in # same as what is in
# numpy.distutils.__config__.blas_opt_info['libraries']? # blas_info['libraries']?
['-l%s' % l for l in ["mk2_core", "mk2_intel_thread", ['-l%s' % l for l in ["mk2_core", "mk2_intel_thread",
"mk2_rt"]]) "mk2_rt"]])
return ' '.join( return ' '.join(
['-L%s' % os.path.join(sys.prefix, "lib")] + ['-L%s' % os.path.join(sys.prefix, "lib")] +
['-l%s' % l for l in numpy.distutils.__config__.blas_opt_info['libraries']]) ['-l%s' % l for l in blas_info['libraries']])
#if numpy was linked with library that are not installed, we can't reuse them. #if numpy was linked with library that are not installed, we
if all(not os.path.exists(dir) for dir in numpy.distutils.__config__.blas_opt_info['library_dirs']): #can't reuse them.
if all(not os.path.exists(dir) for dir in blas_info['library_dirs']):
return "-lblas" return "-lblas"
return ' '.join( return ' '.join(
#TODO: the Gemm op below should separate the -L and -l arguments into the two callbacks that CLinker uses for that stuff. #TODO: the Gemm op below should separate the
# for now, we just pass the whole ldflags as the -l options part. # -L and -l arguments into the two callbacks
['-L%s'%l for l in numpy.distutils.__config__.blas_opt_info['library_dirs']] + # that CLinker uses for that stuff. for now,
['-l%s'%l for l in numpy.distutils.__config__.blas_opt_info['libraries']]) # we just pass the whole ldflags as the -l
# ['-I%s'%l for l in numpy.distutils.__config__.blas_opt_info['include_dirs']]) # options part.
['-L%s' % l for l in blas_info['library_dirs']] +
['-l%s' % l for l in blas_info['libraries']])
# ['-I%s' % l for l in blas_info['include_dirs']])
except KeyError: except KeyError:
return "-lblas" return "-lblas"
...@@ -330,23 +351,27 @@ AddConfigVar('blas.ldflags', ...@@ -330,23 +351,27 @@ AddConfigVar('blas.ldflags',
"lib[s] to include for [Fortran] level-3 blas implementation", "lib[s] to include for [Fortran] level-3 blas implementation",
StrParam(default_blas_ldflags())) StrParam(default_blas_ldflags()))
@utils.memoize @utils.memoize
def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False): def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):
"""Return a list of libraries against which an Op's object file should be """Return a list of libraries against which an Op's object file should be
linked to benefit from a BLAS implementation. linked to benefit from a BLAS implementation.
Default: ['blas'], but configuration variable config.blas.ldflags overrides this. Default: ['blas'], but configuration variable config.blas.ldflags
overrides this.
""" """
rval = [] rval = []
if libs_dir: if libs_dir:
found_dyn=False found_dyn = False
dirs = [x[2:] for x in config.blas.ldflags.split() if x.startswith('-L')] dirs = [x[2:] for x in config.blas.ldflags.split()
if x.startswith('-L')]
l = ldflags() l = ldflags()
for d in dirs: for d in dirs:
for f in os.listdir(d): for f in os.listdir(d):
if f.endswith('.so') or f.endswith('.dylib') or f.endswith('.dll'): if (f.endswith('.so') or f.endswith('.dylib') or
if any([f.find(ll)>=0 for ll in l]): f.endswith('.dll')):
found_dyn=True if any([f.find(ll) >= 0 for ll in l]):
found_dyn = True
if not found_dyn and dirs: if not found_dyn and dirs:
_logger.warning("We did not found a dynamic library into the " _logger.warning("We did not found a dynamic library into the "
"library_dir of the library we use for blas. If you use " "library_dir of the library we use for blas. If you use "
...@@ -361,17 +386,21 @@ def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False): ...@@ -361,17 +386,21 @@ def ldflags(libs=True, flags=False, libs_dir=False, include_dir=False):
if libs_dir and t1 == 'L': if libs_dir and t1 == 'L':
rval.append(t[2:]) rval.append(t[2:])
elif include_dir and t1 == 'I': elif include_dir and t1 == 'I':
raise ValueError('Include dirs are not used for blas. We disable this as this can hide other headers and this is not wanted.', t) raise ValueError('Include dirs are not used for blas. We disable'
' this as this can hide other headers and this'
' is not wanted.', t)
rval.append(t[2:]) rval.append(t[2:])
elif libs and t1=='l': # example -lmkl elif libs and t1 == 'l': # example -lmkl
rval.append(t[2:]) rval.append(t[2:])
elif flags and t1 not in ['L','I','l']: # example -openmp elif flags and t1 not in ['L', 'I', 'l']: # example -openmp
rval.append(t) rval.append(t)
elif flags and t1 == 'L': elif flags and t1 == 'L':
#to find it when we load the compiled op if the env of the used is not well configured. #to find it when we load the compiled op if the env of the
rval.append('-Wl,-rpath,'+t[2:]) #used is not well configured.
rval.append('-Wl,-rpath,' + t[2:])
return rval return rval
class GemmRelated(Op): class GemmRelated(Op):
"""Base class for Gemm and Dot22 """Base class for Gemm and Dot22
...@@ -379,10 +408,13 @@ class GemmRelated(Op): ...@@ -379,10 +408,13 @@ class GemmRelated(Op):
""" """
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)) return (type(self) == type(other))
def __hash__(self): def __hash__(self):
return hash(type(self)) return hash(type(self))
def __str__(self): def __str__(self):
return self.__class__.__name__ return self.__class__.__name__
def c_support_code(self): def c_support_code(self):
#return cblas_header_text() #return cblas_header_text()
mod_str = """ mod_str = """
...@@ -397,6 +429,7 @@ class GemmRelated(Op): ...@@ -397,6 +429,7 @@ class GemmRelated(Op):
} }
""" """
return blas_header_text() + mod_str return blas_header_text() + mod_str
def c_headers(self): def c_headers(self):
# std.cout doesn't require the '%' symbol to print stuff... # std.cout doesn't require the '%' symbol to print stuff...
# so it works much better with python's string-substitution stuff. # so it works much better with python's string-substitution stuff.
...@@ -670,6 +703,7 @@ class GemmRelated(Op): ...@@ -670,6 +703,7 @@ class GemmRelated(Op):
def build_gemm_version(self): def build_gemm_version(self):
return (12,) return (12,)
class Gemm(GemmRelated): class Gemm(GemmRelated):
"""In-place version of matrix-matrix multiplication (with accumulation): """In-place version of matrix-matrix multiplication (with accumulation):
...@@ -681,14 +715,15 @@ class Gemm(GemmRelated): ...@@ -681,14 +715,15 @@ class Gemm(GemmRelated):
b*z + a*dot(x,y) b*z + a*dot(x,y)
The difference between the two is that the top form is destructive on z, The difference between the two is that the top form is destructive
whereas the bottom form is not. Gemm works in-place on the storage on z, whereas the bottom form is not. Gemm works in-place on the
associated with z, and the L{Variable} returned by Gemm has a storage that storage associated with z, and the L{Variable} returned by Gemm
will be aliased to the storage of the z argument. Because of this in-place has a storage that will be aliased to the storage of the z
computation, an L{Apply} of this op will destroy the L{Variable} z on argument. Because of this in-place computation, an L{Apply} of
which it operates. (See L{DestructiveOps} for an explanation of what this op will destroy the L{Variable} z on which it operates. (See
destroying means in the context of theano graphs. See L{BlasLapackSupport} for L{DestructiveOps} for an explanation of what destroying means in
more optimized linear algebra operations.) the context of theano graphs. See L{BlasLapackSupport} for more
optimized linear algebra operations.)
""" """
E_rank = 'gemm only works for rank 2' E_rank = 'gemm only works for rank 2'
...@@ -698,18 +733,20 @@ class Gemm(GemmRelated): ...@@ -698,18 +733,20 @@ class Gemm(GemmRelated):
E_float = 'gemm requires floating-point dtypes' E_float = 'gemm requires floating-point dtypes'
def __init__(self, inplace): def __init__(self, inplace):
self.__setstate__({'inplace':inplace}) self.__setstate__({'inplace': inplace})
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
def __str__(self): def __str__(self):
if self.inplace: inplace_str = 'inplace' if self.inplace:
else: inplace_str = 'no_inplace' inplace_str = 'inplace'
else:
inplace_str = 'no_inplace'
return '%s{%s}' % (self.__class__.__name__, inplace_str) return '%s{%s}' % (self.__class__.__name__, inplace_str)
def __setstate__(self, dct): def __setstate__(self, dct):
...@@ -727,9 +764,11 @@ class Gemm(GemmRelated): ...@@ -727,9 +764,11 @@ class Gemm(GemmRelated):
def make_node(self, *inputs): def make_node(self, *inputs):
inputs = map(T.as_tensor_variable, inputs) inputs = map(T.as_tensor_variable, inputs)
if len(inputs) != 5: if len(inputs) != 5:
raise TypeError("Wrong number of inputs for %s (expected 5, got %s)" % (self, len(inputs))) raise TypeError(
"Wrong number of inputs for %s (expected 5, got %s)" %
(self, len(inputs)))
z, a, x, y, b = inputs z, a, x, y, b = inputs
zr, xr, yr = [set(view_roots(i)) for i in z,x,y] zr, xr, yr = [set(view_roots(i)) for i in z, x, y]
# TODO: justify / delete # TODO: justify / delete
if zr.intersection(xr): if zr.intersection(xr):
...@@ -767,26 +806,26 @@ class Gemm(GemmRelated): ...@@ -767,26 +806,26 @@ class Gemm(GemmRelated):
if not self.inplace: if not self.inplace:
z = z.copy() # the original z will not be changed z = z.copy() # the original z will not be changed
if z.shape == (): if z.shape == ():
z.itemset(z*a + b*numpy.dot(x,y)) z.itemset(z * a + b * numpy.dot(x, y))
zout[0] = z zout[0] = z
else: else:
if b == 0.0: if b == 0.0:
if a == 1.0: if a == 1.0:
z[:] = numpy.dot(x,y) z[:] = numpy.dot(x, y)
elif a == -1.0: elif a == -1.0:
z[:] = -numpy.dot(x,y) z[:] = -numpy.dot(x, y)
else: else:
z[:] = a * numpy.dot(x,y) z[:] = a * numpy.dot(x, y)
elif b == 1.0: elif b == 1.0:
if a == 1.0: if a == 1.0:
z += numpy.dot(x,y) z += numpy.dot(x, y)
elif a == -1.0: elif a == -1.0:
z -= numpy.dot(x,y) z -= numpy.dot(x, y)
else: else:
z += a * numpy.dot(x,y) z += a * numpy.dot(x, y)
else: else:
z *= b z *= b
z += a * numpy.dot(x,y) z += a * numpy.dot(x, y)
zout[0] = z zout[0] = z
setup_z_Nz_Sz_inplace = """ setup_z_Nz_Sz_inplace = """
...@@ -812,10 +851,12 @@ class Gemm(GemmRelated): ...@@ -812,10 +851,12 @@ class Gemm(GemmRelated):
npy_intp dims[2]; npy_intp dims[2];
dims[0] = %(_z)s->dimensions[0]; dims[0] = %(_z)s->dimensions[0];
dims[1] = %(_z)s->dimensions[1]; dims[1] = %(_z)s->dimensions[1];
%(_zout)s = (PyArrayObject*)PyArray_SimpleNew(2, dims, type_num_%(_z)s); %(_zout)s = (PyArrayObject*)PyArray_SimpleNew(2, dims,
type_num_%(_z)s);
//fprintf(stderr, "Gemm Allocating %%i %%i\\n", dims[0], dims[1]); //fprintf(stderr, "Gemm Allocating %%i %%i\\n", dims[0], dims[1]);
if(!%(_zout)s) { if(!%(_zout)s) {
PyErr_SetString(PyExc_MemoryError, "failed to alloc gemm_no_inplace output"); PyErr_SetString(PyExc_MemoryError,
"failed to alloc gemm_no_inplace output");
%(fail)s %(fail)s
} }
} }
...@@ -853,7 +894,8 @@ class Gemm(GemmRelated): ...@@ -853,7 +894,8 @@ class Gemm(GemmRelated):
} }
else else
{ {
PyErr_SetString(PyExc_AssertionError, "neither float nor double dtype"); PyErr_SetString(PyExc_AssertionError,
"neither float nor double dtype");
%(fail)s %(fail)s
} }
} }
...@@ -880,14 +922,16 @@ class Gemm(GemmRelated): ...@@ -880,14 +922,16 @@ class Gemm(GemmRelated):
#undef REAL #undef REAL
""" """
def c_code(self, node, name, inp, out, sub): #DEBUG def c_code(self, node, name, inp, out, sub):
_z, _a, _x, _y, _b = inp _z, _a, _x, _y, _b = inp
_zout, = out _zout, = out
if node.inputs[0].type.dtype.startswith('complex'): if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \ raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__) % self.__class__.__name__)
if not config.blas.ldflags: if not config.blas.ldflags:
return super(Gemm, self).c_code(node, name, (_z, _a, _x, _y, _b), (_zout, ), sub) return super(Gemm, self).c_code(node, name,
(_z, _a, _x, _y, _b), (_zout, ),
sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code return full_code
...@@ -903,6 +947,7 @@ gemm_no_inplace = Gemm(inplace=False) ...@@ -903,6 +947,7 @@ gemm_no_inplace = Gemm(inplace=False)
pprint.assign(gemm_inplace, FunctionPrinter('gemm_inplace')) pprint.assign(gemm_inplace, FunctionPrinter('gemm_inplace'))
pprint.assign(gemm_no_inplace, FunctionPrinter('gemm_no_inplace')) pprint.assign(gemm_no_inplace, FunctionPrinter('gemm_no_inplace'))
def res_is_a(node, op, maxclients=None): def res_is_a(node, op, maxclients=None):
if maxclients is not None: if maxclients is not None:
retval = (len(node.clients) <= maxclients) retval = (len(node.clients) <= maxclients)
...@@ -939,17 +984,21 @@ def _as_scalar(res, dtype=None): ...@@ -939,17 +984,21 @@ def _as_scalar(res, dtype=None):
return rval return rval
def _is_real_matrix(res): def _is_real_matrix(res):
return res.type.dtype in ('float32', 'float64') \ return res.type.dtype in ('float32', 'float64') \
and res.type.ndim == 2 \ and res.type.ndim == 2 \
and res.type.broadcastable[0] == False \ and res.type.broadcastable[0] == False \
and res.type.broadcastable[1] == False #cope with tuple vs. list and res.type.broadcastable[1] == False # cope with tuple vs. list
def _is_real_vector(res): def _is_real_vector(res):
return res.type.dtype in ('float32', 'float64') \ return res.type.dtype in ('float32', 'float64') \
and res.type.ndim == 1 \ and res.type.ndim == 1 \
and res.type.broadcastable[0] == False and res.type.broadcastable[0] == False
def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip=True):
#print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip #print 'BETA L + ALPHA M', beta, L, alpha, M, recurse_flip
#EXPRESSION: (beta * L) + (alpha * M) #EXPRESSION: (beta * L) + (alpha * M)
...@@ -990,7 +1039,6 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -990,7 +1039,6 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
rval = [g.dimshuffle()] rval = [g.dimshuffle()]
return rval return rval
# this is False'd out because of inadequate testing. # this is False'd out because of inadequate testing.
# TODO see ticket #237 # TODO see ticket #237
if False and res_is_a(M, gemm_no_inplace, 1): if False and res_is_a(M, gemm_no_inplace, 1):
...@@ -1000,16 +1048,20 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -1000,16 +1048,20 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
#print 'GEMM', G, L #print 'GEMM', G, L
if res_is_a(G, _dot22, 1): if res_is_a(G, _dot22, 1):
#EXPRESSION: (beta * L) + (alpha * (gemm_no_inplace(dot(x,y), a, u, v, b))) #EXPRESSION: (beta * L) +
# (alpha * (gemm_no_inplace(dot(x,y), a, u, v, b)))
x, y = G.owner.inputs x, y = G.owner.inputs
#EXPRESSION: (beta * L) + (alpha * ((b*dot(x,y) + (a * dot(u, v))))) #EXPRESSION: (beta * L) + (alpha * ((b*dot(x,y) +
#EXPRESSION: (beta * L) + (alpha*b*dot(x,y)) + (alpha * a * dot(u, v)) # (a * dot(u, v)))))
rval = [gemm_no_inplace(gemm_no_inplace(L, alpha * b, x, y, beta), alpha * a, u, v, 1.0)] #EXPRESSION: (beta * L) + (alpha*b*dot(x,y)) +
# (alpha * a * dot(u, v))
rval = [gemm_no_inplace(gemm_no_inplace(L, alpha * b, x, y, beta),
alpha * a, u, v, 1.0)]
return rval return rval
if (G is L): if (G is L):
#EXPRESSION: (beta * L) + (alpha*b*L) + (alpha * a * dot(u, v)) #EXPRESSION: (beta * L) + (alpha*b*L) + (alpha * a * dot(u, v))
rval = [gemm_no_inplace(L, alpha*a, u, v, alpha * b + beta)] rval = [gemm_no_inplace(L, alpha * a, u, v, alpha * b + beta)]
return rval return rval
if (1.0 != alpha): if (1.0 != alpha):
#at the very least, move the alpha inside the gemm_no_inplace #at the very least, move the alpha inside the gemm_no_inplace
...@@ -1017,7 +1069,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True): ...@@ -1017,7 +1069,7 @@ def _beta_L_plus_alpha_M(beta, L, alpha, M, recurse_flip = True):
return rval return rval
if recurse_flip: if recurse_flip:
return _beta_L_plus_alpha_M(alpha, M, beta, L, recurse_flip = False) return _beta_L_plus_alpha_M(alpha, M, beta, L, recurse_flip=False)
else: else:
return False return False
...@@ -1030,18 +1082,19 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -1030,18 +1082,19 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
if scale == -1: if scale == -1:
return -thing return -thing
else: else:
return scale*thing return scale * thing
try: try:
r.type.broadcastable r.type.broadcastable
except Exception: except Exception:
return None return None
if ((r.type.ndim not in (1, 2)) or if ((r.type.ndim not in (1, 2)) or
r.type.dtype not in ('float32', 'float64', 'complex64', 'complex128')): r.type.dtype not in ('float32', 'float64',
'complex64', 'complex128')):
rval.append(scaled(r)) rval.append(scaled(r))
return rval return rval
if maxclients and len(getattr(r,'clients',[])) > maxclients: if maxclients and len(getattr(r, 'clients', [])) > maxclients:
rval.append((scale, r)) rval.append((scale, r))
return rval return rval
...@@ -1074,32 +1127,35 @@ def _gemm_canonicalize(r, scale, rval, maxclients): ...@@ -1074,32 +1127,35 @@ def _gemm_canonicalize(r, scale, rval, maxclients):
matrices.append(i) matrices.append(i)
else: else:
# just put the original arguments as in the base case # just put the original arguments as in the base case
rval.append((scale,r)) rval.append((scale, r))
return rval return rval
if len(matrices)==1: if len(matrices) == 1:
assert len(vectors)==0 assert len(vectors) == 0
m = matrices[0] m = matrices[0]
if len(scalars) == 0: if len(scalars) == 0:
_gemm_canonicalize(m, scale, rval, 1) _gemm_canonicalize(m, scale, rval, 1)
elif len(scalars) == 1: elif len(scalars) == 1:
_gemm_canonicalize(m, scaled(scalars[0]), rval, 1) _gemm_canonicalize(m, scaled(scalars[0]), rval, 1)
else: else:
_gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1) _gemm_canonicalize(m, T.mul(scaled(scalars[0]), *scalars[1:]),
elif len(vectors)==1: rval, 1)
assert len(matrices)==0 elif len(vectors) == 1:
assert len(matrices) == 0
v = vectors[0] v = vectors[0]
if len(scalars) == 0: if len(scalars) == 0:
_gemm_canonicalize(v, scale, rval, 1) _gemm_canonicalize(v, scale, rval, 1)
elif len(scalars) == 1: elif len(scalars) == 1:
_gemm_canonicalize(v, scaled(scalars[0]), rval, 1) _gemm_canonicalize(v, scaled(scalars[0]), rval, 1)
else: else:
_gemm_canonicalize(v, T.mul(scaled(scalars[0]), *scalars[1:]), rval, 1) _gemm_canonicalize(v, T.mul(scaled(scalars[0]),
else: #lets not open this up *scalars[1:]), rval, 1)
rval.append((scale,r)) else: # lets not open this up
rval.append((scale, r))
else: else:
rval.append((scale,r)) rval.append((scale, r))
return rval return rval
def _factor_canonicalized(lst): def _factor_canonicalized(lst):
# remove duplicates from canonicalized list # remove duplicates from canonicalized list
...@@ -1116,17 +1172,17 @@ def _factor_canonicalized(lst): ...@@ -1116,17 +1172,17 @@ def _factor_canonicalized(lst):
# except TypeError: # except TypeError:
# print e, type(e) # print e, type(e)
i = 0 i = 0
while i < len(lst)-1: while i < len(lst) - 1:
try: try:
s_i,M_i = lst[i] s_i, M_i = lst[i]
except Exception: except Exception:
i += 1 i += 1
continue continue
j = i+1 j = i + 1
while j < len(lst): while j < len(lst):
try: try:
s_j,M_j = lst[j] s_j, M_j = lst[j]
except Exception: except Exception:
j += 1 j += 1
continue continue
...@@ -1137,9 +1193,10 @@ def _factor_canonicalized(lst): ...@@ -1137,9 +1193,10 @@ def _factor_canonicalized(lst):
del lst[j] del lst[j]
else: else:
j += 1 j += 1
i+=1 i += 1
return lst return lst
def _gemm_from_factored_list(lst): def _gemm_from_factored_list(lst):
"""Returns None, or a list to replace node.outputs """Returns None, or a list to replace node.outputs
""" """
...@@ -1171,7 +1228,7 @@ def _gemm_from_factored_list(lst): ...@@ -1171,7 +1228,7 @@ def _gemm_from_factored_list(lst):
for i in xrange(len(lst) - 1): for i in xrange(len(lst) - 1):
s_i, M_i = lst[i] s_i, M_i = lst[i]
for j in xrange(i+1, len(lst)): for j in xrange(i + 1, len(lst)):
s_j, M_j = lst[j] s_j, M_j = lst[j]
if M_i.type != M_j.type: if M_i.type != M_j.type:
...@@ -1183,15 +1240,19 @@ def _gemm_from_factored_list(lst): ...@@ -1183,15 +1240,19 @@ def _gemm_from_factored_list(lst):
#print 'GOT IT', gemm_of_sM_list #print 'GOT IT', gemm_of_sM_list
if gemm_of_sM_list: if gemm_of_sM_list:
def item_to_var(t): def item_to_var(t):
try: s,M = t try:
except Exception: return t s, M = t
if s == 1: return M except Exception:
if s == -1: return -M return t
return s*M if s == 1:
return M
if s == -1:
return -M
return s * M
assert len(gemm_of_sM_list) == 1 assert len(gemm_of_sM_list) == 1
add_inputs = [item_to_var(input) add_inputs = [item_to_var(input)
for k, input in enumerate(lst) if k not in (i,j)] for k, input in enumerate(lst) if k not in (i, j)]
add_inputs.extend(gemm_of_sM_list) add_inputs.extend(gemm_of_sM_list)
if len(add_inputs) > 1: if len(add_inputs) > 1:
rval = [T.add(*add_inputs)] rval = [T.add(*add_inputs)]
...@@ -1200,11 +1261,14 @@ def _gemm_from_factored_list(lst): ...@@ -1200,11 +1261,14 @@ def _gemm_from_factored_list(lst):
#print "RETURNING GEMM THIGN", rval #print "RETURNING GEMM THIGN", rval
return rval return rval
def _gemm_from_node2(node): def _gemm_from_node2(node):
""" """
:todo: In many expressions, there are many ways to turn it into a gemm. For example :todo: In many expressions, there are many ways to turn it into a
dot(a,b) + c + d. This function should return all of them, so that if one version of gemm gemm. For example dot(a,b) + c + d. This function should
causes a cycle in the graph, then another application of gemm can be tried. return all of them, so that if one version of gemm causes a
cycle in the graph, then another application of gemm can be
tried.
""" """
lst = [] lst = []
...@@ -1214,10 +1278,11 @@ def _gemm_from_node2(node): ...@@ -1214,10 +1278,11 @@ def _gemm_from_node2(node):
lst = _factor_canonicalized(lst) lst = _factor_canonicalized(lst)
rval = _gemm_from_factored_list(lst) rval = _gemm_from_factored_list(lst)
# It can happen that _factor_canonicalized and _gemm_from_factored_list # It can happen that _factor_canonicalized and
# return a node with an incorrect type. This happens in particular when # _gemm_from_factored_list return a node with an incorrect
# one of the scalar factors forces the upcast of the whole expression. # type. This happens in particular when one of the scalar
# In that case, we simply skip that candidate for Gemm. This was # factors forces the upcast of the whole expression. In that
# case, we simply skip that candidate for Gemm. This was
# discussed in # discussed in
# http://groups.google.com/group/theano-dev/browse_thread/thread/a3096c82856e3ad5, # http://groups.google.com/group/theano-dev/browse_thread/thread/a3096c82856e3ad5,
# but never made it into a trac ticket. # but never made it into a trac ticket.
...@@ -1225,6 +1290,7 @@ def _gemm_from_node2(node): ...@@ -1225,6 +1290,7 @@ def _gemm_from_node2(node):
if rval and (rval[0].type == node.outputs[0].type): if rval and (rval[0].type == node.outputs[0].type):
return rval return rval
class GemmOptimizer(Optimizer): class GemmOptimizer(Optimizer):
"""Graph optimizer for inserting Gemm operations""" """Graph optimizer for inserting Gemm operations"""
def __init__(self): def __init__(self):
...@@ -1343,6 +1409,7 @@ class Dot22(GemmRelated): ...@@ -1343,6 +1409,7 @@ class Dot22(GemmRelated):
_dot22 = Dot22() _dot22 = Dot22()
@local_optimizer([T.dot]) @local_optimizer([T.dot])
def local_dot_to_dot22(node): def local_dot_to_dot22(node):
# This works for tensor.outer too because basic.outer is a macro that # This works for tensor.outer too because basic.outer is a macro that
...@@ -1350,10 +1417,11 @@ def local_dot_to_dot22(node): ...@@ -1350,10 +1417,11 @@ def local_dot_to_dot22(node):
if node.op != T.dot: if node.op != T.dot:
return return
x,y = node.inputs x, y = node.inputs
if y.type.dtype != x.type.dtype: if y.type.dtype != x.type.dtype:
# TODO: upcast one so the types match # TODO: upcast one so the types match
_logger.info('Not optimizing dot with inputs %s %s %s %s', x, y, x.type, y.type) _logger.info('Not optimizing dot with inputs %s %s %s %s',
x, y, x.type, y.type)
return return
if y.type.dtype.startswith('float') or y.type.dtype.startswith('complex'): if y.type.dtype.startswith('float') or y.type.dtype.startswith('complex'):
...@@ -1362,15 +1430,18 @@ def local_dot_to_dot22(node): ...@@ -1362,15 +1430,18 @@ def local_dot_to_dot22(node):
return [_dot22(*node.inputs)] return [_dot22(*node.inputs)]
if x.ndim == 2 and y.ndim == 1: if x.ndim == 2 and y.ndim == 1:
#print "local_dot_to_dot22: MV" #print "local_dot_to_dot22: MV"
return [_dot22(x, y.dimshuffle(0,'x')).dimshuffle(0)] return [_dot22(x, y.dimshuffle(0, 'x')).dimshuffle(0)]
if x.ndim == 1 and y.ndim == 2: if x.ndim == 1 and y.ndim == 2:
#print "local_dot_to_dot22: VM" #print "local_dot_to_dot22: VM"
return [_dot22(x.dimshuffle('x',0), y).dimshuffle(1)] return [_dot22(x.dimshuffle('x', 0), y).dimshuffle(1)]
if x.ndim == 1 and y.ndim == 1: if x.ndim == 1 and y.ndim == 1:
#print "local_dot_to_dot22: VV" #print "local_dot_to_dot22: VV"
return [_dot22(x.dimshuffle('x',0), y.dimshuffle(0,'x')).dimshuffle()] return [_dot22(x.dimshuffle('x', 0),
y.dimshuffle(0, 'x')).dimshuffle()]
_logger.info('Not optimizing dot with inputs %s %s %s %s',
x, y, x.type, y.type)
_logger.info('Not optimizing dot with inputs %s %s %s %s', x, y, x.type, y.type)
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
def local_inplace_gemm(node): def local_inplace_gemm(node):
...@@ -1383,11 +1454,13 @@ def local_inplace_gemv(node): ...@@ -1383,11 +1454,13 @@ def local_inplace_gemv(node):
if node.op == gemv_no_inplace: if node.op == gemv_no_inplace:
return [gemv_inplace(*node.inputs)] return [gemv_inplace(*node.inputs)]
@local_optimizer([ger]) @local_optimizer([ger])
def local_inplace_ger(node): def local_inplace_ger(node):
if node.op == ger: if node.op == ger:
return [ger_destructive(*node.inputs)] return [ger_destructive(*node.inputs)]
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
def local_gemm_to_gemv(node): def local_gemm_to_gemv(node):
"""GEMM acting on row or column matrices -> GEMV """GEMM acting on row or column matrices -> GEMV
...@@ -1401,6 +1474,7 @@ def local_gemm_to_gemv(node): ...@@ -1401,6 +1474,7 @@ def local_gemm_to_gemv(node):
r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b) r = gemv_no_inplace(z.dimshuffle(0), a, x, y.dimshuffle(0), b)
return [r.dimshuffle(0, 'x')] return [r.dimshuffle(0, 'x')]
@local_optimizer([gemm_no_inplace]) @local_optimizer([gemm_no_inplace])
def local_gemm_to_ger(node): def local_gemm_to_ger(node):
"""GEMM computing an outer-product -> GER """GEMM computing an outer-product -> GER
...@@ -1481,8 +1555,8 @@ blas_optdb = SequenceDB() ...@@ -1481,8 +1555,8 @@ blas_optdb = SequenceDB()
# run after numerical stability optimizations (1.5) # run after numerical stability optimizations (1.5)
optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run') optdb.register('BlasOpt', blas_optdb, 1.7, 'fast_run')
# run before specialize (2.0) because specialize is basically a free-for-all that makes the # run before specialize (2.0) because specialize is basically a
# graph crazy. # free-for-all that makes the graph crazy.
blas_optdb.register('local_dot_to_dot22', blas_optdb.register('local_dot_to_dot22',
EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5), EquilibriumOptimizer([local_dot_to_dot22], max_use_ratio=5),
...@@ -1502,7 +1576,8 @@ blas_optdb.register('local_gemm_to_gemv', ...@@ -1502,7 +1576,8 @@ blas_optdb.register('local_gemm_to_gemv',
# After destroyhandler is in but before we try to make elemwise things inplace # After destroyhandler is in but before we try to make elemwise things inplace
# Try to make gemm inplace # Try to make gemm inplace
# Also, need to make the gemm optimisation(step 70) happen before the fusion of elemwise(step 71) # Also, need to make the gemm optimisation(step 70) happen before the
# fusion of elemwise(step 71)
blas_opt_inplace = EquilibriumOptimizer( blas_opt_inplace = EquilibriumOptimizer(
[local_inplace_gemm, local_inplace_gemv, local_inplace_ger], [local_inplace_gemm, local_inplace_gemv, local_inplace_ger],
failure_callback=EquilibriumOptimizer.warn_inplace, failure_callback=EquilibriumOptimizer.warn_inplace,
...@@ -1538,7 +1613,7 @@ class Dot22Scalar(GemmRelated): ...@@ -1538,7 +1613,7 @@ class Dot22Scalar(GemmRelated):
bz = [x.type.broadcastable[0], y.type.broadcastable[1]] bz = [x.type.broadcastable[0], y.type.broadcastable[1]]
outputs = [T.tensor(x.type.dtype, bz)] outputs = [T.tensor(x.type.dtype, bz)]
return Apply(self, [x,y,a], outputs) return Apply(self, [x, y, a], outputs)
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, y, scalar = inp x, y, scalar = inp
...@@ -1546,7 +1621,8 @@ class Dot22Scalar(GemmRelated): ...@@ -1546,7 +1621,8 @@ class Dot22Scalar(GemmRelated):
try: try:
z[0] = numpy.asarray(scalar * numpy.dot(x, y)) z[0] = numpy.asarray(scalar * numpy.dot(x, y))
except ValueError, e: except ValueError, e:
# The error raised by numpy has no shape information, we mean to add that # The error raised by numpy has no shape information, we
# mean to add that
e.args = e.args + (x.shape, y.shape) e.args = e.args + (x.shape, y.shape)
raise raise
...@@ -1558,7 +1634,8 @@ class Dot22Scalar(GemmRelated): ...@@ -1558,7 +1634,8 @@ class Dot22Scalar(GemmRelated):
check_ab_double_or_float = """ check_ab_double_or_float = """
if ((%(_a)s->descr->type_num != PyArray_DOUBLE) if ((%(_a)s->descr->type_num != PyArray_DOUBLE)
&& (%(_a)s->descr->type_num != PyArray_FLOAT)) && (%(_a)s->descr->type_num != PyArray_FLOAT))
{PyErr_SetString(PyExc_NotImplementedError, "type(a) is not double or float"); %(fail)s;} {PyErr_SetString(PyExc_NotImplementedError,
"type(a) is not double or float"); %(fail)s;}
""" """
case_float_ab_constants = """ case_float_ab_constants = """
...@@ -1579,14 +1656,15 @@ class Dot22Scalar(GemmRelated): ...@@ -1579,14 +1656,15 @@ class Dot22Scalar(GemmRelated):
double b = 0.0; double b = 0.0;
""" """
def c_code(self, node, name, inp, out, sub): #DEBUG def c_code(self, node, name, inp, out, sub):
_x, _y, _a = inp _x, _y, _a = inp
_zout, = out _zout, = out
if node.inputs[0].type.dtype.startswith('complex'): if node.inputs[0].type.dtype.startswith('complex'):
raise utils.MethodNotDefined('%s.c_code' \ raise utils.MethodNotDefined('%s.c_code' \
% self.__class__.__name__) % self.__class__.__name__)
if len(self.c_libraries()) <= 0: if len(self.c_libraries()) <= 0:
return super(Dot22Scalar, self).c_code(node, name, (_x, _y), (_zout, ), sub) return super(Dot22Scalar, self).c_code(node, name, (_x, _y),
(_zout, ), sub)
full_code = self.build_gemm_call() % dict(locals(), **sub) full_code = self.build_gemm_call() % dict(locals(), **sub)
return full_code return full_code
...@@ -1599,21 +1677,29 @@ class Dot22Scalar(GemmRelated): ...@@ -1599,21 +1677,29 @@ class Dot22Scalar(GemmRelated):
_dot22scalar = Dot22Scalar() _dot22scalar = Dot22Scalar()
@local_optimizer([T.mul]) @local_optimizer([T.mul])
def local_dot22_to_dot22scalar(node): def local_dot22_to_dot22scalar(node):
""" """
:note: we upcast the scalar if after the multiplication with the dot this give the same type. :note: we upcast the scalar if after the multiplication with the
.. note: dot this give the same type.
We execute this optimizer after the gemm optimizer. This allow to give more priority to gemm that give more speed up then this optimizer, but allow the gemm optimizer to ignore this op.
.. note: We execute this optimizer after the gemm optimizer. This
allow to give more priority to gemm that give more speed up
then this optimizer, but allow the gemm optimizer to ignore
this op.
TODO: support when we can reorder the mul to generate a dot22scalar or fix the canonizer to merge them(1 mul with multiple inputs)
TODO: support when we can reorder the mul to generate a
dot22scalar or fix the canonizer to merge them(1 mul with multiple
inputs)
""" """
if node.op != T.mul: if node.op != T.mul:
return False return False
i_dot22 = [x.owner and x.owner.op==_dot22 for x in node.inputs] i_dot22 = [x.owner and x.owner.op == _dot22 for x in node.inputs]
if not any(i_dot22): return False # no dot22 if not any(i_dot22):
if i_dot22.count(True)>1: return False # no dot22
if i_dot22.count(True) > 1:
#TODO: try each of them. #TODO: try each of them.
pass pass
#return False #TODO fix #return False #TODO fix
...@@ -1621,34 +1707,39 @@ def local_dot22_to_dot22scalar(node): ...@@ -1621,34 +1707,39 @@ def local_dot22_to_dot22scalar(node):
d = node.inputs[dot22_idx] d = node.inputs[dot22_idx]
i_scalar = [_as_scalar(x, dtype=d.dtype) for x in node.inputs] i_scalar = [_as_scalar(x, dtype=d.dtype) for x in node.inputs]
if not any(i_scalar): if not any(i_scalar):
i_mul = [x.owner and x.owner.op ==T.mul for x in node.inputs] i_mul = [x.owner and x.owner.op == T.mul for x in node.inputs]
if not any(i_mul): if not any(i_mul):
#no scalar in input and no multiplication #no scalar in input and no multiplication
#if their was a multiplication we couls reorder the graph by the associativity of the graph. #if their was a multiplication we couls reorder the graph
#by the associativity of the graph.
return False return False
#maybe we can reorder the graph as this mul have a mul in input. #maybe we can reorder the graph as this mul have a mul in input.
#The canonizer should have merged those mul together. #The canonizer should have merged those mul together.
#We support only 1 additional level of mul. #We support only 1 additional level of mul.
mul_idx = i_mul.index(True)#we take the first mul! mul_idx = i_mul.index(True) # we take the first mul!
m = node.inputs[mul_idx] m = node.inputs[mul_idx]
if len(m.owner.inputs)==2 and any([_as_scalar(x, dtype=d.dtype) for x in m.owner.inputs]): if len(m.owner.inputs) == 2 and any([_as_scalar(x, dtype=d.dtype)
for x in m.owner.inputs]):
scalar_idx = -1 scalar_idx = -1
for i,x in enumerate(m.owner.inputs): for i, x in enumerate(m.owner.inputs):
if _as_scalar(x, dtype=d.dtype) and (theano.scalar.upcast(x.type.dtype,d.type.dtype) if _as_scalar(x, dtype=d.dtype) and (theano.scalar.upcast(
x.type.dtype, d.type.dtype)
== d.type.dtype): == d.type.dtype):
scalar_idx = i scalar_idx = i
break break
if scalar_idx < 0: if scalar_idx < 0:
_logger.info('Not optimizing dot22 with inputs %s %s, as the type ' _logger.info('Not optimizing dot22 with inputs %s %s, as the'
'of the scalar cannot be upcasted to the matrix type', ' type of the scalar cannot be upcasted to the'
' matrix type',
node.inputs, [x.type for x in node.inputs]) node.inputs, [x.type for x in node.inputs])
return False return False
a = T.cast(_as_scalar(m.owner.inputs[scalar_idx], dtype=d.dtype), d.type.dtype) a = T.cast(_as_scalar(m.owner.inputs[scalar_idx],
dtype=d.dtype), d.type.dtype)
assert not a.type.ndim assert not a.type.ndim
dot=_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a) dot = _dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)
# What about the other inputs to the original node that were # What about the other inputs to the original node that were
# neither part of the dot22 or this mul? # neither part of the dot22 or this mul?
...@@ -1657,7 +1748,7 @@ def local_dot22_to_dot22scalar(node): ...@@ -1657,7 +1748,7 @@ def local_dot22_to_dot22scalar(node):
assert all((i in (dot22_idx, mul_idx)) assert all((i in (dot22_idx, mul_idx))
for i in xrange(len(node.inputs))) for i in xrange(len(node.inputs)))
return [T.mul(m.owner.inputs[1-i],dot)] return [T.mul(m.owner.inputs[1 - i], dot)]
elif m.owner and m.owner.op == T.mul: elif m.owner and m.owner.op == T.mul:
_logger.info('Not optimizing dot22 with inputs %s %s %s %s. ' _logger.info('Not optimizing dot22 with inputs %s %s %s %s. '
'we need to check in a recursive way in the mul if we can ' 'we need to check in a recursive way in the mul if we can '
...@@ -1667,9 +1758,9 @@ def local_dot22_to_dot22scalar(node): ...@@ -1667,9 +1758,9 @@ def local_dot22_to_dot22scalar(node):
return False return False
scalar_idx = -1 scalar_idx = -1
for i,x in enumerate(node.inputs): for i, x in enumerate(node.inputs):
if (i_scalar[i] is not None if (i_scalar[i] is not None
and (theano.scalar.upcast(x.type.dtype,d.type.dtype) and (theano.scalar.upcast(x.type.dtype, d.type.dtype)
== d.type.dtype)): == d.type.dtype)):
scalar_idx = i scalar_idx = i
break break
...@@ -1689,15 +1780,17 @@ def local_dot22_to_dot22scalar(node): ...@@ -1689,15 +1780,17 @@ def local_dot22_to_dot22scalar(node):
if len(o) == 0: if len(o) == 0:
return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)] return [_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a)]
else: else:
return [T.mul(_dot22scalar(d.owner.inputs[0], d.owner.inputs[1], a), *o)] return [T.mul(_dot22scalar(d.owner.inputs[0],
d.owner.inputs[1], a), *o)]
#must happen after gemm as the gemm optimizer don't understant dot22scalar and gemm give more speed up then dot22scalar #must happen after gemm as the gemm optimizer don't understant
#dot22scalar and gemm give more speed up then dot22scalar
blas_optdb.register('local_dot22_to_dot22scalar', blas_optdb.register('local_dot22_to_dot22scalar',
EquilibriumOptimizer([local_dot22_to_dot22scalar ], max_use_ratio=5), EquilibriumOptimizer([local_dot22_to_dot22scalar], max_use_ratio=5),
11, 'fast_run') 11, 'fast_run')
from opt import register_specialize, register_canonicalize #from opt import register_specialize, register_canonicalize
#@register_specialize #@register_specialize
@local_optimizer([]) @local_optimizer([])
def local_print_as_we_go_along(node): def local_print_as_we_go_along(node):
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论