Commit 565650e4, authored by abergeron

Merge pull request #3311 from bouthilx/sparse_block_dot

Sparse block dot
.. _libdoc_blocksparse:
===========================================================================
:mod:`sandbox.blocksparse` -- Block sparse dot operations (gemv and outer)
===========================================================================
.. module:: sandbox.blocksparse
:platform: Unix, Windows
:synopsis: Block sparse dot
.. moduleauthor:: LISA
API
===
.. automodule:: theano.sandbox.blocksparse
:members:
import numpy
import theano
from theano import Op, Apply
from theano import tensor
from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined
class SparseBlockGemv(Op):
    """
    This op computes the dot product of specified pieces of vectors
    and matrices, returning pieces of vectors:

        for b in range(batch_size):
            for j in range(o.shape[1]):
                for i in range(h.shape[1]):
                    o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])

    where b, h, W, o, iIdx, oIdx are defined in the docstring of make_node.

    .. image:: ../../images/blocksparse.png
        :scale: 50 %
    """
    # Optimizations registered against this op (filled in elsewhere).
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses the storage of the first
        # input (o); destroy_map declares this to the graph machinery.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, W, h, inputIdx, outputIdx):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (batch, oWin, oSize) output vector
        W: (iBlocks, oBlocks, iSize, oSize), weight matrix
        h: (batch, iWin, iSize), input from lower layer (sparse)
        inputIdx: (batch, iWin), indexes of the input blocks
        outputIdx: (batch, oWin), indexes of the output blocks

        returns (batch, oWin, oSize), dot(W[i, j], h[i]) + o[j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `iBlocks` is the total number of blocks in the input (from lower
          layer).
        - `iSize` is the size of each of these input blocks.
        - `iWin` is the number of blocks that will be used as inputs. Which
          blocks will be used is specified in `inputIdx`.
        - `oBlocks` is the number of possible output blocks.
        - `oSize` is the size of each of these output blocks.
        - `oWin` is the number of output blocks that will actually be
          computed. Which blocks will be computed is specified in
          `outputIdx`.
        """
        o = theano.tensor.as_tensor_variable(o)
        W = theano.tensor.as_tensor_variable(W)
        h = theano.tensor.as_tensor_variable(h)
        inputIdx = theano.tensor.as_tensor_variable(inputIdx)
        outputIdx = theano.tensor.as_tensor_variable(outputIdx)

        if o.ndim != 3:
            # Fixed: the message used to say "2D" while the check is ndim != 3.
            raise TypeError('The output o must be a 3D tensor')
        if W.ndim != 4:
            raise TypeError('The weight matrix W must be a 4D tensor')
        if h.ndim != 3:
            raise TypeError('The input h must be a 3D tensor')
        if inputIdx.ndim != 2:
            raise TypeError('The input indices inputIdx must be a 2D tensor')
        if outputIdx.ndim != 2:
            raise TypeError('The output indices outputIdx must be a 2D tensor')

        # Index tensors must hold integers to be usable as block indices.
        assert inputIdx.type.dtype in discrete_dtypes
        assert outputIdx.type.dtype in discrete_dtypes

        # Output has the same dtype/ndim as o, with no broadcastable dims.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, W, h, inputIdx, outputIdx], [output])

    def perform(self, node, inp, out_):
        """Reference (CPU) implementation: accumulate block dot products."""
        o, W, h, iIdx, oIdx = inp[:5]

        if not self.inplace:
            # Preserve the caller's o; accumulate into a private copy.
            o = o.copy()

        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    # This is a gemv I think
                    o[b, j, :] += numpy.dot(h[b, i], w)
        out_[0][0] = o

    def grad(self, inputs, grads):
        """Gradient w.r.t. o, W and h; index inputs have no gradient."""
        o, W, h, inputIdx, outputIdx = inputs
        go = grads[0]

        outer_fun = SparseBlockOuter(self.inplace)
        gemv_fun = SparseBlockGemv(self.inplace)

        # dW = outer(h, go) scattered into the selected blocks.
        Wgrad = outer_fun(W.zeros_like(), h, go, inputIdx, outputIdx)
        # dh = gemv with W transposed block-wise (swap block axes and
        # the per-block matrix axes) and the index roles exchanged.
        hgrad = gemv_fun(h.zeros_like(), W.dimshuffle((1, 0, 3, 2)),
                         go, outputIdx, inputIdx)
        return [go, Wgrad, hgrad,
                grad_undefined(self, 3, inputIdx,
                               "grad of inputIdx makes no sense"),
                grad_undefined(self, 4, outputIdx,
                               "grad of outputIdx makes no sense")]
class SparseBlockOuter(Op):
    """
    This computes the outer product of two sets of pieces of vectors
    updating a full matrix with the results:

        for b in range(batch_size):
            o[xIdx[b, i], yIdx[b, j]] += (alpha * outer(x[b, i], y[b, j]))

    This op is involved in the gradient of SparseBlockGemv.
    """
    # Optimizations registered against this op (filled in elsewhere).
    registered_opts = []

    def __init__(self, inplace=False):
        # When inplace, the output reuses the storage of the first
        # input (o); destroy_map declares this to the graph machinery.
        self.inplace = inplace
        if self.inplace:
            self.destroy_map = {0: [0]}

    def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
        """
        Compute the dot product of the specified pieces of vectors
        and matrices.

        Parameters
        ----------
        var: shape, comment
        o: (xBlocks, yBlocks, xSize, ySize)
        x: (batch, xWin, xSize)
        y: (batch, yWin, ySize)
        xIdx: (batch, iWin), indexes of the x blocks
        yIdx: (batch, oWin), indexes of the y blocks

        returns (xBlocks, yBlocks, xSize, ySize), outer(x[i], y[j]) + o[i, j]

        Notation
        --------
        - `batch` is the number of examples in a minibatch (batch size).
        - `xBlocks` is the total number of blocks in x.
        - `xSize` is the size of each of these x blocks.
        - `xWin` is the number of blocks that will be used as x. Which blocks
          will be used is specified in `xIdx`.
        - `yBlocks` is the number of possible y blocks.
        - `ySize` is the size of each of these y blocks.
        - `yWin` is the number of y blocks that will actually be computed.
          Which blocks will be computed is specified in `yIdx`.
        """
        one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
        o = theano.tensor.as_tensor_variable(o)
        x = theano.tensor.as_tensor_variable(x)
        y = theano.tensor.as_tensor_variable(y)
        # Convert indices too, for consistency with SparseBlockGemv.
        xIdx = theano.tensor.as_tensor_variable(xIdx)
        yIdx = theano.tensor.as_tensor_variable(yIdx)

        if alpha is None:
            # Default scaling factor: 1.0 (no scaling).
            alpha = one

        # Output has the same dtype/ndim as o, with no broadcastable dims.
        output = o.type.__class__(dtype=o.type.dtype,
                                  broadcastable=(False,) * o.ndim)()

        return Apply(self, [o, x, y, xIdx, yIdx, alpha],
                     [output])

    def perform(self, node, inp, out_):
        """Reference (CPU) implementation: scatter scaled outer products."""
        o, x, y, xIdx, yIdx, alpha = inp[:6]

        if not self.inplace:
            # Preserve the caller's o; accumulate into a private copy.
            o = o.copy()

        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    # Apply alpha, as documented in the class docstring and
                    # done by the GPU implementation (was previously ignored
                    # here).
                    o[xIdx[b, i], yIdx[b, j]] += alpha * numpy.outer(
                        x[b, i], y[b, j, :])
        out_[0][0] = o
# Canonical op instances; prefer these over constructing new ops so that
# identical applications can be merged by the graph optimizer.
sparse_block_gemv = SparseBlockGemv(False)
# Inplace variant: overwrites its first input (introduced by optimizations).
sparse_block_gemv_inplace = SparseBlockGemv(True)
sparse_block_outer = SparseBlockOuter(False)
# Inplace variant: overwrites its first input (introduced by optimizations).
sparse_block_outer_inplace = SparseBlockOuter(True)
def sparse_block_dot(W, h, inputIdx, b, outputIdx):
    """
    Compute the dot product (plus bias) of the specified pieces of vectors
    and matrices. See SparseBlockGemv to get more information.

    Parameters
    ----------
    var: shape, comment
    W: (iBlocks, oBlocks, iSize, oSize), weight matrix
    h: (batch, iWin, iSize), input from lower layer (sparse)
    inputIdx: (batch, iWin), indexes of the input blocks
    b: (oBlocks, oSize), bias vector
    outputIdx: (batch, oWin), indexes of the output blocks

    returns (batch, oWin, oSize), dot(W[i, j], h[i]) + b[j]
        but b[j] is only added once

    Notation
    --------
    - `batch` is the number of examples in a minibatch (batch size).
    - `iBlocks` is the total number of blocks in the input (from lower layer).
    - `iSize` is the size of each of these input blocks.
    - `iWin` is the number of blocks that will be used as inputs. Which blocks
      will be used is specified in `inputIdx`.
    - `oBlocks` is the number of possible output blocks.
    - `oSize` is the size of each of these output blocks.
    - `oWin` is the number of output blocks that will actually be computed.
      Which blocks will be computed is specified in `outputIdx`.
    """
    assert inputIdx.ndim == h.ndim - 1
    assert outputIdx.ndim == inputIdx.ndim
    if h.ndim == 2:
        # Unbatched inputs: add a broadcastable batch dimension of size 1.
        h = h.dimshuffle('x', 0, 1)
        inputIdx = inputIdx.dimshuffle('x', 0)
        outputIdx = outputIdx.dimshuffle('x', 0)
    # Seed the accumulator with the bias gathered for the selected output
    # blocks, so b[j] is added exactly once. Reuse the canonical op instance
    # (instead of SparseBlockGemv()) so identical applications can be merged
    # by the graph optimizer.
    return sparse_block_gemv(b.take(outputIdx, axis=0), W, h,
                             inputIdx, outputIdx)
import logging
import numpy import numpy
import theano from theano import Apply, tensor
from theano import Apply, tensor, scalar
from theano.tensor import discrete_dtypes from theano.tensor import discrete_dtypes
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.sandbox.cuda import cuda_available, GpuOp, GpuElemwise from theano.sandbox.cuda import cuda_available, GpuOp
_logger = logging.getLogger('theano.sandbox.cuda.blocksparse')
if cuda_available: if cuda_available:
from theano.sandbox.cuda import (basic_ops, from theano.sandbox.cuda import basic_ops
opt, GpuFromHost,
HostFromGpu, host_from_gpu,
GpuDimShuffle)
from theano.sandbox.cuda.opt_util import alpha_merge, output_merge
class SparseBlockGemvSS(GpuOp): class GpuSparseBlockGemv(GpuOp):
""" """
This op computes the dot product of specified pieces of vectors GPU version of SparseBlockGemv. Check SparseBlockGemv's docstring for more
and matrices, returning pieces of vectors. information.
It computes something like this for each j:
o[j] = sum_over_i(dot(W[i, j], h[i])) + o[j]
The i and j are taken from the inputIdx and outputIdx lists
respectively.
This should not be directly called since the interface is subject This should not be directly called since the interface is subject
to change without notice. Use the sparse_block_dot_SS() function to change without notice. Use the sandbox.blocksparse.sparse_block_dot()
for a stable interface. function for a stable interface.
""" """
def __init__(self, inplace=False): def __init__(self, inplace=False):
...@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp): ...@@ -45,7 +36,7 @@ class SparseBlockGemvSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
def __str__(self): def __str__(self):
return "SparseBlockGemvSS%s" % ("{inplace}" if self.inplace else "") return "GpuSparseBlockGemv%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, W, h, inputIdx, outputIdx): def make_node(self, o, W, h, inputIdx, outputIdx):
o = basic_ops.as_cuda_ndarray_variable(o) o = basic_ops.as_cuda_ndarray_variable(o)
...@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -340,12 +331,12 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
o, W, h, inputIdx, outputIdx = inputs o, W, h, inputIdx, outputIdx = inputs
go = grads[0] go = grads[0]
Wgrad = sparse_block_outer_ss(W.zeros_like(), Wgrad = gpu_sparse_block_outer(W.zeros_like(),
h, go, inputIdx, outputIdx) h, go, inputIdx, outputIdx)
hgrad = sparse_block_gemv_ss(h.zeros_like(), hgrad = gpu_sparse_block_gemv(h.zeros_like(),
W.dimshuffle((1, 0, 3, 2)), W.dimshuffle((1, 0, 3, 2)),
go, go,
outputIdx, inputIdx) outputIdx, inputIdx)
return [go, Wgrad, hgrad, return [go, Wgrad, hgrad,
grad_undefined(self, 3, inputIdx, grad_undefined(self, 3, inputIdx,
"grad of inputIdx makes no sense"), "grad of inputIdx makes no sense"),
...@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -353,25 +344,18 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
"grad of outputIdx makes no sense")] "grad of outputIdx makes no sense")]
sparse_block_gemv_ss = SparseBlockGemvSS(False) gpu_sparse_block_gemv = GpuSparseBlockGemv(False)
sparse_block_gemv_ss_inplace = SparseBlockGemvSS(True) gpu_sparse_block_gemv_inplace = GpuSparseBlockGemv(True)
class SparseBlockOuterSS(GpuOp): class GpuSparseBlockOuter(GpuOp):
""" """
This computes the outer product of two sets of pieces of vectors CPU version of SparseBlockOuter. See SparseBlockOuter's docstring for more
updating a full matrix with the results. information.
It computes something like this:
o[i, j] = (alpha * outer(x[i], y[j])) + o[i, j]
The i and j are taken from the xIdx and yIdx lists respectively.
This op should not be called directly since its interface is This op should not be called directly since its interface is
subject to change without notice. It is involved in the gradient subject to change without notice. It is involved in the gradient
of SparseBlockGemvSS. of GpuSparseBlockGemv. The gradient is not implemented.
""" """
def __init__(self, inplace=False): def __init__(self, inplace=False):
...@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp): ...@@ -386,7 +370,7 @@ class SparseBlockOuterSS(GpuOp):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
def __str__(self): def __str__(self):
return "SparseBlockOuterSS%s" % ("{inplace}" if self.inplace else "") return "GpuSparseBlockOuter%s" % ("{inplace}" if self.inplace else "")
def make_node(self, o, x, y, xIdx, yIdx, alpha=None): def make_node(self, o, x, y, xIdx, yIdx, alpha=None):
one = tensor.constant(numpy.asarray(1.0, dtype='float32')) one = tensor.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1], ...@@ -598,8 +582,10 @@ CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_HOST_DIMS(%(y)s)[1],
%(name)s_x_list, %(name)s_x_list,
%(name)s_y_list, %(name)s_y_list,
%(name)s_out_list, %(name)s_out_list,
CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0], CudaNdarray_HOST_STRIDES(%(y)s)[1], CudaNdarray_HOST_STRIDES(%(x)s)[1],
CudaNdarray_DEV_DATA(%(y)s), CudaNdarray_HOST_STRIDES(%(y)s)[0],
CudaNdarray_HOST_STRIDES(%(y)s)[1],
CudaNdarray_DEV_DATA(%(out)s), CudaNdarray_DEV_DATA(%(out)s),
CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
%(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1), %(name)s_xIdx, PyArray_DIM(%(xIdx)s, 1),
...@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1], ...@@ -642,83 +628,5 @@ CudaNdarray_HOST_STRIDES(%(out)s)[0], CudaNdarray_HOST_STRIDES(%(out)s)[1],
return (11,) return (11,)
sparse_block_outer_ss = SparseBlockOuterSS(False) gpu_sparse_block_outer = GpuSparseBlockOuter(False)
sparse_block_outer_ss_inplace = SparseBlockOuterSS(True) gpu_sparse_block_outer_inplace = GpuSparseBlockOuter(True)
if cuda_available:
@opt.register_opt()
@opt.local_optimizer([sparse_block_gemv_ss], inplace=True)
def local_inplace_blocksparse_gemv(node):
if node.op == sparse_block_gemv_ss:
return [sparse_block_gemv_ss_inplace(*node.inputs)]
@opt.register_opt()
@opt.local_optimizer([sparse_block_outer_ss], inplace=True)
def local_inplace_blocksparse_outer(node):
if node.op == sparse_block_outer_ss:
return [sparse_block_outer_ss_inplace(*node.inputs)]
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @opt.register_opt()
# @alpha_merge(SparseBlockOuterSS, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, SparseBlockOuterSS) -> SparseBlockOuterSS(..., alpha=lr)
# """
# return [sparse_block_outer_ss(*inputs)]
# @opt.register_opt()
# @output_merge(SparseBlockOuterSS, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [sparse_block_outer_ss(*inputs)]
def sparse_block_dot_SS(W, h, inputIdx, b, outputIdx):
"""
Compute the dot product (plus bias) of the specified pieces of vectors
and matrices.
Parameters
----------
W : (iBlocks, oBlocks, iSize, oSize)
Weight matrix.
h : (batch, iWin, iSize)
Input from lower layer (sparse).
inputIdx : (batch, iWin)
Indexes of the input blocks.
b : (oBlocks, oSize)
Bias vector.
outputIdx : (batch, oWin)
Indexes of the output blocks.
Returns
-------
(batch, oWin, oSize)
dot(W[i, j], h[i]) + b[j], but b[j] is only added once.
Notes
-----
- `batch` is the number of examples in a minibatch (batch size).
- `iBlocks` is the total number of blocks in the input (from lower layer).
- `iSize` is the size of each of these input blocks.
- `iWin` is the number of blocks that will be used as inputs. Which blocks
will be used is specified in `inputIdx`.
- `oBlocks` is the number or possible output blocks.
- `oSize` is the size of each of these output blocks.
- `oWin` is the number of output blocks that will actually be computed.
Which blocks will be computed is specified in `outputIdx`.
"""
assert inputIdx.ndim == h.ndim - 1
assert outputIdx.ndim == inputIdx.ndim
if h.ndim == 2:
h = h.dimshuffle('x', 0, 1)
inputIdx = inputIdx.dimshuffle('x', 0)
outputIdx = outputIdx.dimshuffle('x', 0)
return sparse_block_gemv_ss(b.take(outputIdx, axis=0), W, h,
inputIdx, outputIdx)
...@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp): ...@@ -220,7 +220,8 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuOp):
# return () # return ()
return (4,) return (4,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = \
GpuCrossentropySoftmaxArgmax1HotWithBias()
class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp): class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
...@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp): ...@@ -391,7 +392,8 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuOp):
} }
""" % locals() """ % locals()
gpu_crossentropy_softmax_1hot_with_bias_dx = GpuCrossentropySoftmax1HotWithBiasDx() gpu_crossentropy_softmax_1hot_with_bias_dx = \
GpuCrossentropySoftmax1HotWithBiasDx()
class GpuSoftmax(GpuOp): class GpuSoftmax(GpuOp):
......
...@@ -18,7 +18,7 @@ import theano.ifelse ...@@ -18,7 +18,7 @@ import theano.ifelse
from six.moves import reduce, xrange from six.moves import reduce, xrange
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, toolbox) Optimizer, TopoOptimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
...@@ -31,10 +31,10 @@ from theano.sandbox.cuda.basic_ops import ( ...@@ -31,10 +31,10 @@ from theano.sandbox.cuda.basic_ops import (
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, from theano.sandbox.cuda.blas import (
gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv, gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace, gpu_gemm_no_inplace, GpuConv,
GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights, GpuCorrMM, GpuCorrMM_gradInputs, GpuCorrMM_gradWeights,
GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights) GpuCorr3dMM, GpuCorr3dMM_gradInputs, GpuCorr3dMM_gradWeights)
from theano.sandbox.cuda.blas import gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_inplace
from theano.sandbox.cuda.cula import gpu_solve from theano.sandbox.cuda.cula import gpu_solve
...@@ -42,13 +42,22 @@ from theano.sandbox.cuda.cula import gpu_solve ...@@ -42,13 +42,22 @@ from theano.sandbox.cuda.cula import gpu_solve
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace
from theano.sandbox.cuda.blas import gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_no_inplace
from theano.sandbox.cuda.blas import (GpuDownsampleFactorMax, from theano.sandbox.cuda.blas import (
GpuDownsampleFactorMaxGrad, GpuDownsampleFactorMaxGradGrad) GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad,
GpuDownsampleFactorMaxGradGrad)
from theano.sandbox.blocksparse import SparseBlockGemv, SparseBlockOuter
from theano.sandbox.cuda.blocksparse import (
GpuSparseBlockGemv,
GpuSparseBlockOuter,
gpu_sparse_block_gemv_inplace,
gpu_sparse_block_outer_inplace)
from theano.sandbox.cuda.nnet import ( from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias, GpuCrossentropySoftmaxArgmax1HotWithBias,
GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmax1HotWithBiasDx,
GpuSoftmax, GpuSoftmaxWithBias) GpuSoftmax, GpuSoftmaxWithBias)
from theano.sandbox.cuda.elemwise import SupportCodeError from theano.sandbox.cuda.elemwise import SupportCodeError
from theano.scalar.basic_scipy import Erfinv from theano.scalar.basic_scipy import Erfinv
...@@ -77,10 +86,11 @@ except ImportError: ...@@ -77,10 +86,11 @@ except ImportError:
gpu_cut_copies = EquilibriumDB() gpu_cut_copies = EquilibriumDB()
gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1, gpu_seqopt.register('gpu_local_optimizations', gpu_optimizer, 1,
'fast_run', 'fast_compile', 'inplace', 'gpu') 'fast_run', 'fast_compile', 'inplace', 'gpu')
gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2, gpu_seqopt.register('gpu_cut_transfers', gpu_cut_copies, 2,
'fast_run', 'fast_compile', 'gpu') 'fast_run', 'fast_compile', 'gpu')
# DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS enable the GPU! # DO NOT PUT fast_run or fast_compile in gpu_opt! This will ALWAYS
# enable the GPU!
optdb.register('gpu_opt', optdb.register('gpu_opt',
gpu_seqopt, gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1, optdb.__position__.get('add_destroy_handler', 49.5) - 1,
...@@ -266,8 +276,8 @@ def local_gpu_elemwise_0(node): ...@@ -266,8 +276,8 @@ def local_gpu_elemwise_0(node):
'uint16']) 'uint16'])
# case 1 - all inputs are already float32 # case 1 - all inputs are already float32
if all([i.type.dtype == 'float32' for i in node.inputs]): if all([i.type.dtype == 'float32' for i in node.inputs]):
# TODO: change this when fusion makes Elemwise with multiple # TODO: change this when fusion makes Elemwise with
# outputs # multiple outputs
gpu_elemwise = new_op(*(gpu_from_host(i) gpu_elemwise = new_op(*(gpu_from_host(i)
for i in node.inputs)) for i in node.inputs))
# case 2 - it is still ok if some inputs were upcast to float32 # case 2 - it is still ok if some inputs were upcast to float32
...@@ -346,8 +356,8 @@ def local_gpu_split(node): ...@@ -346,8 +356,8 @@ def local_gpu_split(node):
any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx any([c != 'output' and isinstance(c.op, GpuFromHost) for c, idx
in outs_clients])): in outs_clients])):
new_op = GpuSplit(node.op.len_splits) new_op = GpuSplit(node.op.len_splits)
split_res = new_op(as_cuda_ndarray_variable(input), *node.inputs[1:], split_res = new_op(as_cuda_ndarray_variable(input),
return_list=True) *node.inputs[1:], return_list=True)
return [host_from_gpu(o) for o in split_res] return [host_from_gpu(o) for o in split_res]
return False return False
...@@ -374,7 +384,8 @@ def local_gpu_dimshuffle_0(node): ...@@ -374,7 +384,8 @@ def local_gpu_dimshuffle_0(node):
dimshuffle_node = host_input.owner dimshuffle_node = host_input.owner
new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable, new_op = GpuDimShuffle(dimshuffle_node.op.input_broadcastable,
dimshuffle_node.op.new_order) dimshuffle_node.op.new_order)
return [new_op(as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))] return [new_op(
as_cuda_ndarray_variable(dimshuffle_node.inputs[0]))]
return False return False
...@@ -389,8 +400,8 @@ def local_gpu_specifyShape_0(node): ...@@ -389,8 +400,8 @@ def local_gpu_specifyShape_0(node):
if isinstance(node.op, tensor.SpecifyShape): if isinstance(node.op, tensor.SpecifyShape):
input = node.inputs[0] input = node.inputs[0]
if input.owner and isinstance(input.owner.op, HostFromGpu): if input.owner and isinstance(input.owner.op, HostFromGpu):
return [host_from_gpu(tensor.specify_shape(as_cuda_ndarray_variable(input), return [host_from_gpu(tensor.specify_shape(
*node.inputs[1:]))] as_cuda_ndarray_variable(input), *node.inputs[1:]))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
...@@ -467,11 +478,15 @@ def local_gpu_dot_to_dot22(node): ...@@ -467,11 +478,15 @@ def local_gpu_dot_to_dot22(node):
shape_out))] shape_out))]
return False return False
@local_optimizer(None) @local_optimizer(None)
def local_assert_no_cpu_op(node): def local_assert_no_cpu_op(node):
if not isinstance(node.op, GpuOp) and all([var.owner and isinstance(var.owner.op, if (not isinstance(node.op, GpuOp) and
HostFromGpu) for var in node.inputs]) and any([[c for c in var.clients all([var.owner and isinstance(var.owner.op, HostFromGpu)
if isinstance(c[0].op, GpuFromHost)] for var in node.outputs]): for var in node.inputs]) and
any([[c for c in var.clients if isinstance(c[0].op, GpuFromHost)]
for var in node.outputs])):
if config.assert_no_cpu_op == "warn": if config.assert_no_cpu_op == "warn":
_logger.warning(("CPU op %s is detected in the computational" _logger.warning(("CPU op %s is detected in the computational"
" graph") % node) " graph") % node)
...@@ -492,7 +507,7 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2) ...@@ -492,7 +507,7 @@ theano.compile.optdb.register('assert_no_cpu_op', assert_no_cpu_op, 49.2)
@register_opt() @register_opt()
@local_optimizer([theano.ifelse.IfElse, gpu_from_host]) @local_optimizer([theano.ifelse.IfElse, gpu_from_host])
def local_gpu_lazy_ifelse(node): def local_gpu_lazy_ifelse(node):
""" """
gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host) gpu_from_host(ifelse) -> gpu_ifelse(gpu_from_host)
ifelse(host_from_gpu) -> host_from_gpu(ifelse) ifelse(host_from_gpu) -> host_from_gpu(ifelse)
...@@ -572,7 +587,8 @@ def local_gpu_dot22(node): ...@@ -572,7 +587,8 @@ def local_gpu_dot22(node):
if host_input.owner and isinstance(host_input.owner.op, if host_input.owner and isinstance(host_input.owner.op,
tensor.blas.Dot22): tensor.blas.Dot22):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_dot22(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))] return [gpu_dot22(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y))]
if isinstance(node.op, tensor.blas.Dot22): if isinstance(node.op, tensor.blas.Dot22):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -597,7 +613,8 @@ def local_gpu_dot22scalar(node): ...@@ -597,7 +613,8 @@ def local_gpu_dot22scalar(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
tensor.blas.Dot22Scalar)): tensor.blas.Dot22Scalar)):
x, y, scalar = host_input.owner.inputs x, y, scalar = host_input.owner.inputs
return [gpu_dot22scalar(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), return [gpu_dot22scalar(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y),
tensor.blas._as_scalar(scalar))] tensor.blas._as_scalar(scalar))]
if isinstance(node.op, tensor.blas.Dot22Scalar): if isinstance(node.op, tensor.blas.Dot22Scalar):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
...@@ -625,7 +642,8 @@ def local_gpu_solve(node): ...@@ -625,7 +642,8 @@ def local_gpu_solve(node):
isinstance(host_input.owner.op, isinstance(host_input.owner.op,
slinalg.Solve)): slinalg.Solve)):
x, y = host_input.owner.inputs x, y = host_input.owner.inputs
return [gpu_solve(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y))] return [gpu_solve(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y))]
if isinstance(node.op, slinalg.Solve): if isinstance(node.op, slinalg.Solve):
if any([i.owner and isinstance(i.owner.op, HostFromGpu) if any([i.owner and isinstance(i.owner.op, HostFromGpu)
...@@ -633,7 +651,7 @@ def local_gpu_solve(node): ...@@ -633,7 +651,7 @@ def local_gpu_solve(node):
x, y = node.inputs x, y = node.inputs
return [host_from_gpu( return [host_from_gpu(
gpu_solve(as_cuda_ndarray_variable(x), gpu_solve(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y)))] as_cuda_ndarray_variable(y)))]
return False return False
...@@ -648,7 +666,7 @@ def local_gpu_gemv(node): ...@@ -648,7 +666,7 @@ def local_gpu_gemv(node):
""" """
gemvs = (tensor.blas.Gemv, gemvs = (tensor.blas.Gemv,
tensor.blas_c.CGemv, tensor.blas_c.CGemv,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if host_input.owner and isinstance(host_input.owner.op, gemvs): if host_input.owner and isinstance(host_input.owner.op, gemvs):
...@@ -688,7 +706,7 @@ def local_gpu_ger(node): ...@@ -688,7 +706,7 @@ def local_gpu_ger(node):
gers = (tensor.blas_c.CGer, gers = (tensor.blas_c.CGer,
tensor.blas.Ger, tensor.blas.Ger,
tensor.blas_scipy.ScipyGer, tensor.blas_scipy.ScipyGer,
) )
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
...@@ -711,8 +729,7 @@ def local_gpu_ger(node): ...@@ -711,8 +729,7 @@ def local_gpu_ger(node):
as_cuda_ndarray_variable(z), as_cuda_ndarray_variable(z),
a, a,
as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y) as_cuda_ndarray_variable(y)))]
))]
return False return False
...@@ -741,11 +758,12 @@ def local_gpu_gemm(node): ...@@ -741,11 +758,12 @@ def local_gpu_gemm(node):
y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu)) y_on_gpu = (y.owner and isinstance(y.owner.op, HostFromGpu))
z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu)) z_on_gpu = (z.owner and isinstance(z.owner.op, HostFromGpu))
if x_on_gpu or y_on_gpu or z_on_gpu: if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm_no_inplace(as_cuda_ndarray_variable(z), return [host_from_gpu(gpu_gemm_no_inplace(
a, as_cuda_ndarray_variable(z),
as_cuda_ndarray_variable(x), a,
as_cuda_ndarray_variable(y), as_cuda_ndarray_variable(x),
b))] as_cuda_ndarray_variable(y),
b))]
return False return False
...@@ -882,8 +900,8 @@ def local_gpu_elemwise_careduce(node): ...@@ -882,8 +900,8 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr) isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)] return [GpuCAReduce(op.reduce_mask, op.scalar_op, scal.basic.sqr)(inp)]
...@@ -898,7 +916,8 @@ def local_gpu_reshape(node): ...@@ -898,7 +916,8 @@ def local_gpu_reshape(node):
isinstance(host_input.owner.op, tensor.Reshape): isinstance(host_input.owner.op, tensor.Reshape):
rshp = host_input.owner.op rshp = host_input.owner.op
x, shp = host_input.owner.inputs x, shp = host_input.owner.inputs
gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x), shp) gpu_reshape = GpuReshape(rshp.ndim)(as_cuda_ndarray_variable(x),
shp)
if gpu_reshape.broadcastable != node.outputs[0].broadcastable: if gpu_reshape.broadcastable != node.outputs[0].broadcastable:
# this can happen as we always return False for all broadcast # this can happen as we always return False for all broadcast
# dim in GpuReshape but not for Reshape # dim in GpuReshape but not for Reshape
...@@ -957,23 +976,27 @@ def local_gpu_subtensor(node): ...@@ -957,23 +976,27 @@ def local_gpu_subtensor(node):
# to the GPU in that case. # to the GPU in that case.
return return
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x), *coords)] return [GpuSubtensor(subt.idx_list)(as_cuda_ndarray_variable(x),
*coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and if (x.owner and
isinstance(x.owner.op, HostFromGpu) and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"): x.dtype == "float32"):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and
isinstance(gpu_x.owner.op, GpuFromHost) and isinstance(gpu_x.owner.op, GpuFromHost) and
# And it is a shared var or an input of the graph. # And it is a shared var or an input of the graph.
not gpu_x.owner.inputs[0].owner): not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or isinstance(n.op, GpuOp) if any([n == 'output' or isinstance(n.op, GpuOp)
for n, _ in node.outputs[0].clients]): for n, _ in node.outputs[0].clients]):
return return
else: else:
return [host_from_gpu(as_cuda_ndarray_variable(node.outputs[0]))] return [host_from_gpu(as_cuda_ndarray_variable(
node.outputs[0]))]
return return
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
...@@ -992,11 +1015,13 @@ def local_gpu_advanced_subtensor1(node): ...@@ -992,11 +1015,13 @@ def local_gpu_advanced_subtensor1(node):
host_input.owner.op.__class__ is tensor.AdvancedSubtensor1: host_input.owner.op.__class__ is tensor.AdvancedSubtensor1:
x = host_input.owner.inputs[0] x = host_input.owner.inputs[0]
coords = host_input.owner.inputs[1:] coords = host_input.owner.inputs[1:]
return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x), *coords)] return [GpuAdvancedSubtensor1()(as_cuda_ndarray_variable(x),
*coords)]
if node.op.__class__ is tensor.AdvancedSubtensor1: if node.op.__class__ is tensor.AdvancedSubtensor1:
x = node.inputs[0] x = node.inputs[0]
coords = node.inputs[1:] coords = node.inputs[1:]
if x.owner and isinstance(x.owner.op, HostFromGpu) and x.dtype == "float32": if (x.owner and isinstance(x.owner.op, HostFromGpu) and
x.dtype == "float32"):
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))] return [host_from_gpu(GpuAdvancedSubtensor1()(gpu_x, *coords))]
return False return False
...@@ -1027,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1027,12 +1052,14 @@ def local_gpu_advanced_incsubtensor1(node):
if (compute_capability < 2 or if (compute_capability < 2 or
x.ndim != 2 or x.ndim != 2 or
y.ndim != 2): y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
else: else:
gpu_op = GpuAdvancedIncSubtensor1_dev20( gpu_op = GpuAdvancedIncSubtensor1_dev20(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
return [gpu_op(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(y), *coords)] return [gpu_op(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1 # Should not execute for GpuAdvancedIncSubtensor1
if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and if (node.op.__class__ is tensor.AdvancedIncSubtensor1 and
...@@ -1183,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -1183,7 +1210,7 @@ def local_gpu_pdbbreakpoint_op(node):
nb_monitored_vars = len(node.outputs) nb_monitored_vars = len(node.outputs)
for i in range(nb_monitored_vars): for i in range(nb_monitored_vars):
inp = old_inputs[i+1] inp = old_inputs[i + 1]
out = old_outputs[i] out = old_outputs[i]
input_is_from_gpu = (inp.owner and input_is_from_gpu = (inp.owner and
...@@ -1248,18 +1275,17 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node): ...@@ -1248,18 +1275,17 @@ def local_gpu_crossentorpy_softmax_argmax_1hot_with_bias(node):
# thing if we want, since this gpu op will cast to integers # thing if we want, since this gpu op will cast to integers
# internally anyway # internally anyway
int_cast_ops = ( int_cast_ops = (
tensor.basic._convert_to_int32, tensor.basic._convert_to_int32,
tensor.basic._convert_to_int8, tensor.basic._convert_to_int8,
tensor.basic._convert_to_int16, tensor.basic._convert_to_int16,
tensor.basic._convert_to_int64, tensor.basic._convert_to_int64)
)
while y.owner and y.owner.op in int_cast_ops: while y.owner and y.owner.op in int_cast_ops:
y = y.owner.inputs[0] y = y.owner.inputs[0]
gpu_nll, gpu_sm, gpu_am = \ gpu_nll, gpu_sm, gpu_am = \
GpuCrossentropySoftmaxArgmax1HotWithBias()( GpuCrossentropySoftmaxArgmax1HotWithBias()(
gpu_x, gpu_x,
as_cuda_ndarray_variable(b), as_cuda_ndarray_variable(b),
as_cuda_ndarray_variable(cast(y, 'float32'))) as_cuda_ndarray_variable(cast(y, 'float32')))
am_dtype = node.outputs[2].type.dtype am_dtype = node.outputs[2].type.dtype
return [host_from_gpu(gpu_nll), return [host_from_gpu(gpu_nll),
host_from_gpu(gpu_sm), host_from_gpu(gpu_sm),
...@@ -1302,7 +1328,8 @@ def local_gpu_softmax_with_bias(node): ...@@ -1302,7 +1328,8 @@ def local_gpu_softmax_with_bias(node):
x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu) x_on_gpu = x.owner and isinstance(x.owner.op, HostFromGpu)
b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu) b_on_gpu = b.owner and isinstance(b.owner.op, HostFromGpu)
if x_on_gpu or b_on_gpu: if x_on_gpu or b_on_gpu:
gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x), as_cuda_ndarray_variable(b)) gpu_sm = GpuSoftmaxWithBias()(as_cuda_ndarray_variable(x),
as_cuda_ndarray_variable(b))
return [host_from_gpu(gpu_sm)] return [host_from_gpu(gpu_sm)]
return False return False
...@@ -1319,6 +1346,7 @@ def _gpu_conv_to_fftconv(node): ...@@ -1319,6 +1346,7 @@ def _gpu_conv_to_fftconv(node):
if (node.op.imshp is not None and if (node.op.imshp is not None and
node.op.imshp[-1] is not None and node.op.imshp[-1] is not None and
node.op.imshp[-1] % 2 == 1): node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True kwargs['pad_last_dim'] = True
# If the user supplied the full nonsymbolic image_shape and # If the user supplied the full nonsymbolic image_shape and
# filter_shape in conv2d(), we can pass it on to conv2d_fft(). # filter_shape in conv2d(), we can pass it on to conv2d_fft().
...@@ -1332,7 +1360,8 @@ def _gpu_conv_to_fftconv(node): ...@@ -1332,7 +1360,8 @@ def _gpu_conv_to_fftconv(node):
(node.op.nkern is not None) and (node.op.nkern is not None) and
(len(node.op.imshp) == 3) and (len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)): (node.op.imshp[0] is not None)):
kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + node.op.kshp kwargs['filter_shape'] = (node.op.nkern, node.op.imshp[0]) + \
node.op.kshp
rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs) rval = conv2d_fft(node.inputs[0], node.inputs[1], **kwargs)
if node.outputs[0].broadcastable != rval.broadcastable: if node.outputs[0].broadcastable != rval.broadcastable:
# With given shape information, conv2d_fft may return a different # With given shape information, conv2d_fft may return a different
...@@ -1348,6 +1377,7 @@ def local_conv_fft_valid(node): ...@@ -1348,6 +1377,7 @@ def local_conv_fft_valid(node):
if (node.op.border_mode == 'valid' and if (node.op.border_mode == 'valid' and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1) and
node.op.fft_opt): node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return False return False
...@@ -1358,6 +1388,7 @@ def local_conv_fft_full(node): ...@@ -1358,6 +1388,7 @@ def local_conv_fft_full(node):
if (node.op.border_mode == 'full' and if (node.op.border_mode == 'full' and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1) and
node.op.fft_opt): node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return return
...@@ -1396,19 +1427,19 @@ def local_gpu_conv(node): ...@@ -1396,19 +1427,19 @@ def local_gpu_conv(node):
# print op.kshp, op.imshp[1:3] # print op.kshp, op.imshp[1:3]
# print op.kshp_logical, logical_img_hw # print op.kshp_logical, logical_img_hw
ret = GpuConv(border_mode=op.out_mode, ret = GpuConv(border_mode=op.out_mode,
subsample=(op.dx, op.dy), subsample=(op.dx, op.dy),
logical_img_hw=logical_img_hw, logical_img_hw=logical_img_hw,
logical_kern_hw=op.kshp_logical, logical_kern_hw=op.kshp_logical,
logical_kern_align_top=op.kshp_logical_top_aligned, logical_kern_align_top=op.kshp_logical_top_aligned,
kshp=op.kshp, kshp=op.kshp,
version=op.version, version=op.version,
direction_hint=op.direction_hint, direction_hint=op.direction_hint,
verbose=op.verbose, verbose=op.verbose,
imshp=op.imshp, imshp=op.imshp,
nkern=op.nkern, nkern=op.nkern,
bsize=op.bsize, bsize=op.bsize,
fft_opt=op.fft_opt fft_opt=op.fft_opt
) )
if op.imshp_logical is not None: if op.imshp_logical is not None:
logical_img_hw = op.imshp_logical[1:3] logical_img_hw = op.imshp_logical[1:3]
if logical_img_hw != op.imshp[1:3]: if logical_img_hw != op.imshp[1:3]:
...@@ -1471,6 +1502,7 @@ def local_gpu_conv(node): ...@@ -1471,6 +1502,7 @@ def local_gpu_conv(node):
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
node.op.border_mode in ['full', 'valid']): node.op.border_mode in ['full', 'valid']):
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
subsample = node.op.subsample subsample = node.op.subsample
...@@ -1494,7 +1526,7 @@ def local_conv_gemm(node): ...@@ -1494,7 +1526,7 @@ def local_conv_gemm(node):
# we know the kernel and output size # we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
(node.op.imshp[-1] - node.op.kshp[1] + 1)) (node.op.imshp[-1] - node.op.kshp[1] + 1))
if ((node.op.bsize is not None) and if ((node.op.bsize is not None) and
(len(node.op.imshp) == 3) and (len(node.op.imshp) == 3) and
(node.op.imshp[0] is not None)): (node.op.imshp[0] is not None)):
...@@ -1516,7 +1548,7 @@ def local_conv_gemm(node): ...@@ -1516,7 +1548,7 @@ def local_conv_gemm(node):
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs # call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)( rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
if node.outputs[0].broadcastable != rval.broadcastable: if node.outputs[0].broadcastable != rval.broadcastable:
# With given shape information, conv2d_fft may return a different # With given shape information, conv2d_fft may return a different
# broadcast pattern than GpuConv. This is forbidden, so we fix it. # broadcast pattern than GpuConv. This is forbidden, so we fix it.
...@@ -1594,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1594,10 +1626,12 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
if ((var in inputs) and if ((var in inputs) and
(shape is not None) and (shape is not None) and
not any(s is None for s in shape)): not any(s is None for s in shape)):
result[var] = theano.shared( result[var] = theano.shared(
# TODO: Use var.type.filter when cuda_ndarray.filter supports non-strict casts # TODO: Use var.type.filter when cuda_ndarray.filter
# var.type.filter(numpy.random.randn(*shape), # supports non-strict casts
# allow_downcast=True), # var.type.filter(numpy.random.randn(*shape),
# allow_downcast=True),
numpy.require(numpy.random.randn(*shape), numpy.require(numpy.random.randn(*shape),
dtype=var.dtype), dtype=var.dtype),
var.name, var.name,
...@@ -1608,10 +1642,11 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1608,10 +1642,11 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
# We just register all optimizers from conv_groupopt with the metaoptimizer # We just register all optimizers from conv_groupopt with the metaoptimizer
conv_metaopt = ConvMetaOptimizer( conv_metaopt = ConvMetaOptimizer(
conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts) conv_groupopt.query(*['+' + name for name in conv_groupopt._names]).opts)
# Then we add some optimizers that try less obvious options # Then we add some optimizers that try less obvious options
conv_metaopt.register(dnn.local_conv_dnn_alternative) conv_metaopt.register(dnn.local_conv_dnn_alternative)
# Finally, we register the metaoptimizer as the first optimizer in conv_groupopt # Finally, we register the metaoptimizer as the first optimizer in
# conv_groupopt
conv_groupopt.register('conv_meta', conv_metaopt, 0) conv_groupopt.register('conv_meta', conv_metaopt, 0)
...@@ -1656,6 +1691,7 @@ def local_convgrad3d_fft(node): ...@@ -1656,6 +1691,7 @@ def local_convgrad3d_fft(node):
return False return False
if (isinstance(node.op, ConvGrad3D) and if (isinstance(node.op, ConvGrad3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
# Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t) # Shuffle inputs signal from (b, 0, 1, t, ic) to (ic, b, 0, 1, t)
...@@ -1742,8 +1778,8 @@ def local_convgrad3d_gemm(node): ...@@ -1742,8 +1778,8 @@ def local_convgrad3d_gemm(node):
f = node.inputs[3] f = node.inputs[3]
f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3)) f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(x, f, rval = GpuCorr3dMM_gradWeights(subsample=(sx, sy, sz))(
shape=node.inputs[2][1:4]) x, f, shape=node.inputs[2][1:4])
# Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic) # Shuffle from (ic, oc, 0, 1, t) to (oc, 0, 1, t, ic)
return [rval.dimshuffle(0, 2, 3, 4, 1)] return [rval.dimshuffle(0, 2, 3, 4, 1)]
...@@ -1765,7 +1801,8 @@ def local_convtransp3d_gemm(node): ...@@ -1765,7 +1801,8 @@ def local_convtransp3d_gemm(node):
# Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t) # Shuffle dCdH from (b, 0, 1, t, oc) to (b, oc, 0, 1, t)
f = node.inputs[3] f = node.inputs[3]
f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3)) f = gpu_contiguous(f.dimshuffle(0, 4, 1, 2, 3))
rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x, topgrad=f) rval = GpuCorr3dMM_gradInputs(subsample=(sx, sy, sz))(kern=x,
topgrad=f)
# Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic) # Shuffle from (ic, b, 0, 1, t) to (b, 0, 1, t, ic)
return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]] return [rval.dimshuffle(0, 2, 3, 4, 1) + node.inputs[1]]
...@@ -1781,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample ...@@ -1781,6 +1818,7 @@ import theano.tensor.signal.downsample as downsample
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if (isinstance(node.op, downsample.DownsampleFactorMax) if (isinstance(node.op, downsample.DownsampleFactorMax)
and node.op.ds == node.op.st): and node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if node.op.padding != (0, 0) or node.op.mode != 'max': if node.op.padding != (0, 0) or node.op.mode != 'max':
...@@ -1796,11 +1834,13 @@ def local_gpu_downsample_factor_max(node): ...@@ -1796,11 +1834,13 @@ def local_gpu_downsample_factor_max(node):
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if (isinstance(node.op, downsample.MaxPoolGrad) and if (isinstance(node.op, downsample.MaxPoolGrad) and
node.op.ds == node.op.st): node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if (node.op.padding != (0, 0) or if (node.op.padding != (0, 0) or
node.op.mode != 'max' or node.op.mode != 'max' or
node.op.st != node.op.ds): node.op.st != node.op.ds):
return return
x, z, gz = node.inputs x, z, gz = node.inputs
if (x.owner and isinstance(x.owner.op, HostFromGpu)): if (x.owner and isinstance(x.owner.op, HostFromGpu)):
...@@ -1871,7 +1911,8 @@ def local_gpu_join(node): ...@@ -1871,7 +1911,8 @@ def local_gpu_join(node):
# print "OPT: axis_and_tensors=", axis_and_tensors # print "OPT: axis_and_tensors=", axis_and_tensors
matches = [(not t.owner is None and isinstance(t.owner.op, HostFromGpu)) or matches = [(t.owner is not None and
isinstance(t.owner.op, HostFromGpu)) or
isinstance(t, gof.Constant) for t in axis_and_tensors[1:]] isinstance(t, gof.Constant) for t in axis_and_tensors[1:]]
# print "OPT: matches =", matches # print "OPT: matches =", matches
...@@ -1879,7 +1920,8 @@ def local_gpu_join(node): ...@@ -1879,7 +1920,8 @@ def local_gpu_join(node):
if all(matches): if all(matches):
# the extra gpu_from_host introduced here will # the extra gpu_from_host introduced here will
# be removed by further optimizations # be removed by further optimizations
new_tensors = [as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]] new_tensors = [as_cuda_ndarray_variable(t)
for t in axis_and_tensors[1:]]
new_a_and_t = [axis_and_tensors[0]] + new_tensors new_a_and_t = [axis_and_tensors[0]] + new_tensors
replacement_node = host_from_gpu(gpu_join(*new_a_and_t)) replacement_node = host_from_gpu(gpu_join(*new_a_and_t))
...@@ -1936,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt', ...@@ -1936,7 +1978,6 @@ optdb.register('InplaceGpuBlasOpt',
def get_device_type_sizes(): def get_device_type_sizes():
""" """
Returns Returns
------- -------
tuple tuple
...@@ -1957,7 +1998,8 @@ def get_device_type_sizes(): ...@@ -1957,7 +1998,8 @@ def get_device_type_sizes():
del gpu_int_size del gpu_int_size
del t del t
except Exception as e: except Exception as e:
_logger.warning(("Optimization Warning: " _logger.warning((
"Optimization Warning: "
"Got the following error, but you can ignore it. " "Got the following error, but you can ignore it. "
"This could cause less GpuElemwise fused together.\n" "This could cause less GpuElemwise fused together.\n"
"%s") % e) "%s") % e)
...@@ -1992,7 +2034,7 @@ def max_inputs_to_GpuElemwise(node): ...@@ -1992,7 +2034,7 @@ def max_inputs_to_GpuElemwise(node):
size_param_mandatory = int_size # for numels size_param_mandatory = int_size # for numels
size_param_mandatory += int_size * ndim # for the shape size_param_mandatory += int_size * ndim # for the shape
size_param_mandatory += sum((gpu_ptr_size + int_size * ndim) size_param_mandatory += sum((gpu_ptr_size + int_size * ndim)
for i in node.outputs) for i in node.outputs)
nb_bytes_avail = argument_limit - size_param_mandatory nb_bytes_avail = argument_limit - size_param_mandatory
nb_bytes_per_inputs = (ndim * int_size) + gpu_ptr_size nb_bytes_per_inputs = (ndim * int_size) + gpu_ptr_size
...@@ -2032,11 +2074,11 @@ def split_huge_add_or_mul(node): ...@@ -2032,11 +2074,11 @@ def split_huge_add_or_mul(node):
# GpuElemwise fusion # GpuElemwise fusion
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op( gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
GpuElemwise, GpuElemwise, max_inputs_to_GpuElemwise)
max_inputs_to_GpuElemwise)
if config.gpu.local_elemwise_fusion: if config.gpu.local_elemwise_fusion:
_logger.debug("enabling optimization fusion of gpu elemwise in fast_run") _logger.debug("enabling optimization fusion of gpu elemwise in fast_run")
# Must be after cpu fusion at 40, gpu at 48.5 and before AddDestroyHandler at 49.5 # Must be after cpu fusion at 40, gpu at 48.5 and before
# AddDestroyHandler at 49.5
optdb.register('gpu_elemwise_fusion', optdb.register('gpu_elemwise_fusion',
tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion),
49, 'fast_run', 'fusion', 49, 'fast_run', 'fusion',
...@@ -2050,7 +2092,7 @@ else: ...@@ -2050,7 +2092,7 @@ else:
# GpuElemwise inplace # GpuElemwise inplace
gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op( gpu_inplace_elemwise_optimizer = tensor.opt.inplace_elemwise_optimizer_op(
GpuElemwise) GpuElemwise)
# DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile. # DO NOT PLACE add a 'gpu' tag here! This would enable it in fast_compile.
# It still will be run in fast_run with device=gpu with the current tag. # It still will be run in fast_run with device=gpu with the current tag.
optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75, optdb.register('gpu_inplace_elemwise_opt', gpu_inplace_elemwise_optimizer, 75,
...@@ -2064,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])( ...@@ -2064,7 +2106,8 @@ gpu_elemwise_alloc = gof.local_optimizer([GpuElemwise])(
tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle) tensor.opt.local_elemwise_alloc_op(GpuElemwise, GpuAlloc, GpuDimShuffle)
) )
register_opt()(gpu_elemwise_alloc) register_opt()(gpu_elemwise_alloc)
register_opt()(tensor.opt.local_useless_elemwise) # needed by gpu_elemwise_alloc # needed by gpu_elemwise_alloc
register_opt()(tensor.opt.local_useless_elemwise)
tensor.opt.register_specialize_device(gpu_elemwise_alloc) tensor.opt.register_specialize_device(gpu_elemwise_alloc)
...@@ -2110,8 +2153,7 @@ def local_gpualloc(node): ...@@ -2110,8 +2153,7 @@ def local_gpualloc(node):
new_out.type.broadcastable): new_out.type.broadcastable):
assert b_new or (not b_old) assert b_new or (not b_old)
new_out = tensor.patternbroadcast(new_out, old_out.broadcastable) new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
# if old_out.type != new_out.type:
#import pdb; pdb.set_trace()
return [new_out] return [new_out]
...@@ -2134,12 +2176,14 @@ def local_gpualloc_memset_0(node): ...@@ -2134,12 +2176,14 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, CudaNdarrayConstant) and if (isinstance(inp, CudaNdarrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_out = GpuAlloc(memset_0=True)(*node.inputs)
old_bcast = node.outputs[0].type.broadcastable old_bcast = node.outputs[0].type.broadcastable
if new_out.type.broadcastable != old_bcast: if new_out.type.broadcastable != old_bcast:
# check that we did not try discarding a broadcastable dimension # check that we did not try discarding a broadcastable
assert not any(b_old and not b_new for b_old, b_new in zip( # dimension
old_bcast, new_out.type.broadcastable)) assert not any(b_old and not b_new for b_old, b_new in
zip(old_bcast, new_out.type.broadcastable))
# force old broadcasting pattern; we must not change it here # force old broadcasting pattern; we must not change it here
new_out = tensor.patternbroadcast(new_out, old_bcast) new_out = tensor.patternbroadcast(new_out, old_bcast)
return [new_out] return [new_out]
...@@ -2172,6 +2216,7 @@ def local_gpu_eye(node): ...@@ -2172,6 +2216,7 @@ def local_gpu_eye(node):
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"): host_input.owner.op.dtype == "float32"):
return [gpu_eye(*host_input.owner.inputs)] return [gpu_eye(*host_input.owner.inputs)]
if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32": if isinstance(node.op, tensor.Eye) and node.op.dtype == "float32":
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
...@@ -2183,6 +2228,7 @@ def local_gpu_eye(node): ...@@ -2183,6 +2228,7 @@ def local_gpu_eye(node):
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
return as_cuda_ndarray_variable(x) return as_cuda_ndarray_variable(x)
else: else:
return x return x
...@@ -2237,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -2237,6 +2283,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def tensor_to_cuda(x): def tensor_to_cuda(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
y = CudaNdarrayType(broadcastable=x.type.broadcastable)() y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
if x.name: if x.name:
y.name = x.name + '[cuda]' y.name = x.name + '[cuda]'
...@@ -2259,7 +2306,8 @@ def local_gpu_extract_diagonal(node): ...@@ -2259,7 +2306,8 @@ def local_gpu_extract_diagonal(node):
theano.tensor.TensorType)): theano.tensor.TensorType)):
inp = node.inputs[0] inp = node.inputs[0]
if inp.owner and isinstance(inp.owner.op, HostFromGpu): if inp.owner and isinstance(inp.owner.op, HostFromGpu):
return [host_from_gpu(nlinalg.extract_diag(as_cuda_ndarray_variable(inp)))] return [host_from_gpu(nlinalg.extract_diag(
as_cuda_ndarray_variable(inp)))]
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
...@@ -2295,6 +2343,7 @@ def gpuScanOptimization(node): ...@@ -2295,6 +2343,7 @@ def gpuScanOptimization(node):
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1): len(host_input.owner.outputs) == 1):
# Note that we are not doing the right thing here !! # Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one # This is because the local optimizer expects only one
# output that corresponds to the input of ``node`` # output that corresponds to the input of ``node``
...@@ -2348,6 +2397,7 @@ def gpuScanOptimization(node): ...@@ -2348,6 +2397,7 @@ def gpuScanOptimization(node):
# scan(host_from_gpu) -> host_from_gpu(GPUscan) # scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan
and not node.op.info['gpu']): and not node.op.info['gpu']):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -2420,4 +2470,132 @@ optdb.register('gpu_scanOp_make_inplace', ...@@ -2420,4 +2470,132 @@ optdb.register('gpu_scanOp_make_inplace',
'inplace', 'inplace',
'scan') 'scan')
# XXX: these optimisations were badly broken and now require a working
# beta param (could only be a 0/1 thing for outer_merge, but
# alpha_merge needs the full range).
# @register_opt()
# @alpha_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=?, nd=4)
# def local_merge_blocksparse_alpha(node, *inputs):
# """
# GpuElemwise{mul}(lr, GpuSparseBlockOuter) ->
# GpuSparseBlockOuter(..., alpha=lr)
# """
# return [gpu_sparse_block_outer(*inputs)]
# @register_opt()
# @output_merge(GpuSparseBlockOuter, alpha_in=5, beta_in=? out_in=0, nd=4)
# def local_merge_blocksparse_output(node, *inputs):
# return [gpu_sparse_block_outer(*inputs)]
def _owner_isinstance(inp, test_class):
"""
Tests whether input has an owner and if its owner is
of type `test_class`
"""
return bool(inp.owner) and isinstance(inp.owner.op, test_class)
def _clear_host_from_gpu(inputs):
    """
    Return a copy of `inputs` where every variable produced by a
    HostFromGpu node is replaced by that node's input (i.e. the
    underlying GPU variable); other variables are kept as-is.
    """
    # Comprehension instead of the manual append loop (same semantics).
    return [inp.owner.inputs[0] if _owner_isinstance(inp, HostFromGpu)
            else inp
            for inp in inputs]
@register_opt()
@local_optimizer([SparseBlockGemv, GpuFromHost])
def gpu_sparse_block_gemv_opt(node):
    """
    Move SparseBlockGemv to the GPU:

    SparseBlockGemv(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockGemv(input))
    or
    GpuFromHost(SparseBlockGemv) -> GpuSparseBlockGemv
    """
    op = node.op
    if isinstance(op, SparseBlockGemv):
        # Only lift when at least one input already lives on the GPU.
        if any(_owner_isinstance(i, HostFromGpu) for i in node.inputs):
            gpu_inputs = _clear_host_from_gpu(node.inputs)
            gpu_out = GpuSparseBlockGemv(op.inplace)(*gpu_inputs)
            return [host_from_gpu(gpu_out)]
    if isinstance(op, GpuFromHost):
        host_out = node.inputs[0]
        if _owner_isinstance(host_out, SparseBlockGemv):
            gemv_node = host_out.owner
            gpu_inputs = _clear_host_from_gpu(gemv_node.inputs)
            return [GpuSparseBlockGemv(gemv_node.op.inplace)(*gpu_inputs)]
@register_opt()
@local_optimizer([SparseBlockOuter, GpuFromHost])
def gpu_sparse_block_outer_opt(node):
    """
    Move SparseBlockOuter to the GPU:

    SparseBlockOuter(HostFromGpu(input)) ->
        HostFromGpu(GpuSparseBlockOuter(input))
    or
    GpuFromHost(SparseBlockOuter) -> GpuSparseBlockOuter
    """
    op = node.op
    if isinstance(op, SparseBlockOuter):
        # Only lift when at least one input already lives on the GPU.
        if any(_owner_isinstance(i, HostFromGpu) for i in node.inputs):
            gpu_inputs = _clear_host_from_gpu(node.inputs)
            gpu_out = GpuSparseBlockOuter(op.inplace)(*gpu_inputs)
            return [host_from_gpu(gpu_out)]
    if isinstance(op, GpuFromHost):
        host_out = node.inputs[0]
        if _owner_isinstance(host_out, SparseBlockOuter):
            outer_node = host_out.owner
            gpu_inputs = _clear_host_from_gpu(outer_node.inputs)
            return [GpuSparseBlockOuter(outer_node.op.inplace)(*gpu_inputs)]
@local_optimizer([GpuSparseBlockGemv], inplace=True)
def local_inplace_gpu_sparse_block_gemv(node):
    """
    GpuSparseBlockGemv(inplace=False) -> GpuSparseBlockGemv(inplace=True)
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockGemv) or op.inplace:
        return False
    return [gpu_sparse_block_gemv_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run', 'inplace' and 'gpu' tags.
# warn_inplace downgrades a failed inplace substitution to a warning.
compile.optdb.register('local_inplace_gpu_sparse_block_gemv',
                       TopoOptimizer(
                           local_inplace_gpu_sparse_block_gemv,
                           failure_callback=TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
@local_optimizer([GpuSparseBlockOuter], inplace=True)
def local_inplace_gpu_sparse_block_outer(node):
    """
    GpuSparseBlockOuter(inplace=False) -> GpuSparseBlockOuter(inplace=True)
    """
    op = node.op
    if not isinstance(op, GpuSparseBlockOuter) or op.inplace:
        return False
    return [gpu_sparse_block_outer_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run', 'inplace' and 'gpu' tags.
# warn_inplace downgrades a failed inplace substitution to a warning.
compile.optdb.register('local_inplace_gpu_sparse_block_outer',
                       TopoOptimizer(
                           local_inplace_gpu_sparse_block_outer,
                           failure_callback=TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace', 'gpu')  # DEBUG
import theano.sandbox.cuda.extra_ops import theano.sandbox.cuda.extra_ops
import numpy import numpy
from numpy.random import randn
from unittest import TestCase
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano import theano
from theano import tensor from theano import tensor
import theano.tests.unittest_tools as utt import theano.tests.unittest_tools as utt
import theano.sandbox.tests.test_blocksparse
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available: from theano.sandbox.cuda.blocksparse import (GpuSparseBlockOuter,
raise SkipTest('Optional package cuda disabled') gpu_sparse_block_gemv,
gpu_sparse_block_outer)
from theano.sandbox.cuda.basic_ops import (GpuDimShuffle,
as_cuda_ndarray_variable)
from theano.sandbox.cuda.blocksparse import (sparse_block_dot_SS,
sparse_block_gemv_ss,
sparse_block_outer_ss,
sparse_block_outer_ss_inplace,
SparseBlockOuterSS)
from theano.sandbox.cuda.var import float32_shared_constructor from theano.sandbox.cuda.var import float32_shared_constructor
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -29,187 +21,56 @@ else: ...@@ -29,187 +21,56 @@ else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def setup(): class BlockSparse_Gemv_and_Outer(
utt.seed_rng() theano.sandbox.tests.test_blocksparse.BlockSparse_Gemv_and_Outer):
def setUp(self):
utt.seed_rng()
def blocksparse_data(): self.mode = mode_with_gpu.excluding('constant_folding')
nInputBlock = 128 self.gemv_op = gpu_sparse_block_gemv
nOutputBlock = 64 self.outer_op = gpu_sparse_block_outer
inputSize = 40
outputSize = 30
inputWindowSize = 7
outputWindowSize = 9
batchSize = 2
input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
permutation = numpy.random.permutation
inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
for _ in range(batchSize))
outputIndice = numpy.vstack(permutation(nOutputBlock)[:outputWindowSize]
for _ in range(batchSize))
weight = randn(nInputBlock, nOutputBlock,
inputSize, outputSize).astype('float32')
bias = randn(nOutputBlock, outputSize).astype('float32')
return weight, input, inputIndice, bias, outputIndice
def blocksparse(W, h, iIdx, b, oIdx):
o = b.take(oIdx, axis=0)
for b in range(o.shape[0]):
for j in range(o.shape[1]):
outputIdx = oIdx[b, j]
for i in range(h.shape[1]):
inputIdx = iIdx[b, i]
w = W[inputIdx, outputIdx]
# this below is a gemv I think
o[b, j, :] += numpy.dot(h[b, i], w)
return o
def test_blocksparse():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_dot_SS(W, h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
utt.assert_allclose(ref_out, th_out)
test_blocksparse.setup = setup
# test the fortan order for W (which can happen in the grad for some graphs).
def test_blocksparseF():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_dot_SS(GpuDimShuffle((False, False, False, False),
(0, 1, 3, 2))(
as_cuda_ndarray_variable(W)),
h, iIdx, b, oIdx)
f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
ref_out = blocksparse(W_val, h_val, iIdx_val, b_val, oIdx_val)
utt.assert_allclose(ref_out, th_out)
def test_blocksparse_grad():
h_val = randn(1, 2, 3).astype('float32')
iIdx_val = numpy.random.permutation(3)[:2][None, :]
oIdx_val = numpy.random.permutation(3)[:2][None, :]
W_val = randn(3, 3, 3, 4).astype('float32')
b_val = randn(3, 4).astype('float32')
iIdx = theano.tensor.constant(iIdx_val)
oIdx = theano.tensor.constant(oIdx_val)
def f(b, h, W):
return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
utt.verify_grad(f, [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_1():
# This tests that we correctly handle cases where dimensions are 1.
h_val = randn(1, 1, 1).astype('float32')
iIdx_val = numpy.random.permutation(1)[:1][None, :]
oIdx_val = numpy.random.permutation(1)[:1][None, :]
W_val = randn(1, 1, 1, 1).astype('float32')
b_val = randn(1, 1).astype('float32')
iIdx = theano.tensor.constant(iIdx_val)
oIdx = theano.tensor.constant(oIdx_val)
def f(b, h, W):
return sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
utt.verify_grad(f, [b_val, h_val, W_val], mode=mode_with_gpu)
def test_blocksparse_grad_shape():
b = tensor.fmatrix()
W = tensor.ftensor4()
h = tensor.ftensor3()
iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix()
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
go = theano.grad(o.sum(), [b, W, h])
f = theano.function([W, h, iIdx, b, oIdx], go, mode=mode_with_gpu)
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
# just make sure that it runs correcly and all the shapes are ok.
b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
assert b_g.shape == b_val.shape
assert h_g.shape == h_val.shape
assert W_g.shape == W_val.shape
# This test is temporarily disabled since we disabled the output_merge # This test is temporarily disabled since we disabled the output_merge
# and alpha_merge optimizations for blocksparse due to brokeness. # and alpha_merge optimizations for blocksparse due to brokeness.
# Re-enable when those are re-added. # Re-enable when those are re-added.
def Xtest_blocksparse_grad_merge(): def Xtest_blocksparse_grad_merge(self):
b = tensor.fmatrix() b = tensor.fmatrix()
h = tensor.ftensor3() h = tensor.ftensor3()
iIdx = tensor.lmatrix() iIdx = tensor.lmatrix()
oIdx = tensor.lmatrix() oIdx = tensor.lmatrix()
W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data() W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
W = float32_shared_constructor(W_val) W = float32_shared_constructor(W_val)
o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx) o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
gW = theano.grad(o.sum(), W) gW = theano.grad(o.sum(), W)
lr = numpy.asarray(0.05, dtype='float32') lr = numpy.asarray(0.05, dtype='float32')
upd = W - lr * gW upd = W - lr * gW
f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
mode=mode_with_gpu) mode=mode_with_gpu)
# Make sure the lr update was merged. # Make sure the lr update was merged.
assert isinstance(f1.maker.fgraph.outputs[0].owner.op, SparseBlockOuterSS) assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
GpuSparseBlockOuter)
# Exclude the merge optimizations. # Exclude the merge optimizations.
mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha') mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
mode = mode.excluding('local_merge_blocksparse_output') mode = mode.excluding('local_merge_blocksparse_output')
f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode) f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
# Make sure the lr update is not merged. # Make sure the lr update is not merged.
assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
SparseBlockOuterSS) GpuSparseBlockOuter)
f2(h_val, iIdx_val, b_val, oIdx_val) f2(h_val, iIdx_val, b_val, oIdx_val)
W_ref = W.get_value() W_ref = W.get_value()
# reset the var # reset the var
W.set_value(W_val) W.set_value(W_val)
f1(h_val, iIdx_val, b_val, oIdx_val) f1(h_val, iIdx_val, b_val, oIdx_val)
W_opt = W.get_value() W_opt = W.get_value()
utt.assert_allclose(W_ref, W_opt) utt.assert_allclose(W_ref, W_opt)
...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops ...@@ -29,6 +29,9 @@ from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv from theano.scalar.basic_scipy import erfinv
from theano.sandbox.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
...@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot(): ...@@ -740,6 +743,36 @@ def test_local_gpu_dot_to_dot22dot():
cmp((3, 4), (4,)) cmp((3, 4), (4,))
def test_blocksparse_gpu_gemv_opt():
    """
    Compiling sparse_block_dot in GPU mode should leave a
    GpuSparseBlockGemv node as the next-to-last node of the graph
    (the last one transfers the result back to the host).
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fn = theano.function([W, h, iIdx, b, oIdx], out, mode=mode_with_gpu)

    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, GpuSparseBlockGemv)
def test_blocksparse_gpu_outer_opt():
    """
    Compiling the gradient of sparse_block_dot w.r.t. W in GPU mode
    should leave a GpuSparseBlockOuter node as the next-to-last node.
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    grad_W = tensor.grad(out.sum(), wrt=W)
    fn = theano.function([W, h, iIdx, b, oIdx], [out, grad_W],
                         mode=mode_with_gpu)

    nodes = fn.maker.fgraph.toposort()
    assert isinstance(nodes[-2].op, GpuSparseBlockOuter)
class test_diag(theano.tensor.tests.test_nlinalg.test_diag): class test_diag(theano.tensor.tests.test_nlinalg.test_diag):
mode = mode_with_gpu mode = mode_with_gpu
shared = staticmethod(cuda.shared_constructor) shared = staticmethod(cuda.shared_constructor)
...@@ -756,4 +789,3 @@ if __name__ == '__main__': ...@@ -756,4 +789,3 @@ if __name__ == '__main__':
test_opt_gpujoin_onlyajoin() test_opt_gpujoin_onlyajoin()
test_opt_gpujoin_joinvectors_elemwise_then_minusone() test_opt_gpujoin_joinvectors_elemwise_then_minusone()
test_opt_gpujoin_joinvectors_negativeaxes() test_opt_gpujoin_joinvectors_negativeaxes()
"""
Optimizations addressing the ops in sandbox root directory
"""
from theano import compile # to register the optimizer built by this file
from theano import gof
from theano.sandbox.blocksparse import (
SparseBlockGemv,
SparseBlockOuter,
sparse_block_gemv_inplace,
sparse_block_outer_inplace)
@gof.local_optimizer([SparseBlockGemv], inplace=True)
def local_inplace_sparse_block_gemv(node):
    """
    SparseBlockGemv(inplace=False) -> SparseBlockGemv(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockGemv) or op.inplace:
        return False
    return [sparse_block_gemv_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run' and 'inplace' tags, so it only runs
# when inplace optimizations are enabled.
compile.optdb.register('local_inplace_sparse_block_gemv',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_gemv,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')  # DEBUG
@gof.local_optimizer([SparseBlockOuter], inplace=True)
def local_inplace_sparse_block_outer(node):
    """
    SparseBlockOuter(inplace=False) -> SparseBlockOuter(inplace=True)
    """
    op = node.op
    if not isinstance(op, SparseBlockOuter) or op.inplace:
        return False
    return [sparse_block_outer_inplace(*node.inputs)]
# Register the inplace substitution in the optimizer database at
# position 60 under the 'fast_run' and 'inplace' tags, so it only runs
# when inplace optimizations are enabled.
compile.optdb.register('local_inplace_sparse_block_outer',
                       gof.TopoOptimizer(
                           local_inplace_sparse_block_outer,
                           failure_callback=gof.TopoOptimizer.warn_inplace),
                       60, 'fast_run', 'inplace')  # DEBUG
"""
Tests for block sparse dot
"""
import unittest
import numpy
from numpy.random import randn
import theano
from theano import tensor
import theano.tests.unittest_tools as utt
from theano.sandbox.blocksparse import sparse_block_dot, \
sparse_block_gemv, sparse_block_outer
class BlockSparse_Gemv_and_Outer(unittest.TestCase):
    """
    Tests for the sparse block gemv and outer ops.

    `setUp` installs the CPU ops and a CPU compilation mode; a subclass
    can presumably override it to test another backend (e.g. GPU) —
    the GPU test file appears to do exactly that.
    """

    def runTest(self):
        # Present so the TestCase can be instantiated directly.
        pass

    def setUp(self):
        utt.seed_rng()
        # constant_folding is excluded so the ops under test remain in
        # the compiled graph instead of being folded away.
        self.mode = theano.compile.get_default_mode().excluding(
            'constant_folding'
        )
        self.gemv_op = sparse_block_gemv
        self.outer_op = sparse_block_outer

    @staticmethod
    def gemv_data():
        # Random data for the gemv tests:
        #   weight:       (nInputBlock, nOutputBlock, inputSize, outputSize)
        #   input:        (batchSize, inputWindowSize, inputSize)
        #   inputIndice:  (batchSize, inputWindowSize) int32 block indices
        #   bias:         (nOutputBlock, outputSize)
        #   outputIndice: (batchSize, outputWindowSize) int32 block indices
        nInputBlock = 8
        nOutputBlock = 7
        inputSize = 6
        outputSize = 5
        inputWindowSize = 4
        outputWindowSize = 3
        batchSize = 2

        # Smaller alternative sizes, kept for manual debugging:
        # nInputBlock = 2
        # nOutputBlock = 2
        # inputSize = 2
        # outputSize = 2
        # inputWindowSize = 1
        # outputWindowSize = 1
        # batchSize = 1

        input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
        permutation = numpy.random.permutation
        # Indices come from permutations, so each block index appears at
        # most once per sample.
        # NOTE(review): numpy.vstack over a generator expression is
        # deprecated in recent NumPy — wrap in list() if upgrading.
        inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
                                   for _ in range(batchSize)).astype('int32')
        outputIndice = numpy.vstack(
            permutation(nOutputBlock)[:outputWindowSize]
            for _ in range(batchSize)).astype('int32')
        weight = randn(nInputBlock, nOutputBlock,
                       inputSize, outputSize).astype('float32')
        bias = randn(nOutputBlock, outputSize).astype('float32')

        return weight, input, inputIndice, bias, outputIndice

    @staticmethod
    def outer_data():
        # Random data for the outer tests. Unlike gemv_data, indices are
        # drawn with randint, so a block index may repeat within a sample.
        nInputBlock = 8
        nOutputBlock = 7
        xSize = 6
        ySize = 5
        xWindowSize = 4
        yWindowSize = 3
        batchSize = 2

        o = randn(nInputBlock, nOutputBlock, xSize, ySize).astype('float32')
        x = randn(batchSize, xWindowSize, xSize).astype('float32')
        y = randn(batchSize, yWindowSize, ySize).astype('float32')
        randint = numpy.random.randint

        xIdx = numpy.vstack(randint(0, nInputBlock, size=xWindowSize)
                            for _ in range(batchSize)).astype('int32')
        yIdx = numpy.vstack(randint(0, nOutputBlock, size=yWindowSize)
                            for _ in range(batchSize)).astype('int32')

        return o, x, y, xIdx, yIdx

    @staticmethod
    def gemv_numpy(o, W, h, iIdx, oIdx):
        # Reference implementation: for each sample and each selected
        # output block, accumulate the dot product of every input block
        # with its weight block.  NOTE: mutates and returns `o`.
        for b in range(o.shape[0]):
            for j in range(o.shape[1]):
                outputIdx = oIdx[b, j]
                for i in range(h.shape[1]):
                    inputIdx = iIdx[b, i]
                    w = W[inputIdx, outputIdx]
                    o[b, j, :] += numpy.dot(h[b, i], w)
        return o

    @staticmethod
    def gemv_numpy2(o, W, h, iIdx, oIdx):
        """
        Other implementation: one flattened matrix product per sample.
        """
        from numpy import ix_
        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])].swapaxes(1, 2)
            w = w.reshape((w.shape[0] * w.shape[1], w.shape[2] * w.shape[3]))
            o[b] += numpy.dot(h[b].ravel(), w).reshape(o.shape[1:])
        return o

    @staticmethod
    def gemv_numpy3(o, W, h, iIdx, oIdx):
        """
        Other implementation: einsum over the selected weight blocks.
        """
        from numpy import ix_
        for b in range(o.shape[0]):
            w = W[ix_(iIdx[b], oIdx[b])]
            # The next three lines do the same operation. The last one is the
            # fastest.
            # o[b] += (h[b][:, None, :, None] * w).sum(axis=(0, 2))
            # o[b] += numpy.tensordot(h[b], w, [(0,1),(0,2)])
            o[b] += numpy.einsum('ik,ijkl', h[b], w)
        return o

    @staticmethod
    def gemv_data2():
        # Larger problem sizes (single sample) — presumably for
        # performance experiments; not referenced by the tests below.
        nInputBlock = 100
        nOutputBlock = 100
        inputSize = 50
        outputSize = 50
        inputWindowSize = 30
        outputWindowSize = 30
        batchSize = 1

        input = randn(batchSize, inputWindowSize, inputSize).astype('float32')
        permutation = numpy.random.permutation
        inputIndice = numpy.vstack(permutation(nInputBlock)[:inputWindowSize]
                                   for _ in range(batchSize)).astype('int32')
        outputIndice = numpy.vstack(
            permutation(nOutputBlock)[:outputWindowSize]
            for _ in range(batchSize)).astype('int32')
        weight = randn(nInputBlock, nOutputBlock,
                       inputSize, outputSize).astype('float32')
        bias = randn(nOutputBlock, outputSize).astype('float32')

        return weight, input, inputIndice, bias, outputIndice

    @staticmethod
    def outer_numpy(o, x, y, xIdx, yIdx):
        # Reference implementation of the block outer product.
        # NOTE: mutates and returns `o`.
        for b in range(x.shape[0]):
            for i in range(xIdx.shape[1]):
                for j in range(yIdx.shape[1]):
                    o[xIdx[b, i], yIdx[b, j]] += numpy.outer(x[b, i, :],
                                                             y[b, j, :])
        return o

    def test_sparseblockdot(self):
        """
        Compares the numpy version of sparseblockgemv to sparse_block_dot.
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = sparse_block_dot(W, h, iIdx, b, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv(self):
        """
        Compares the numpy and theano versions of sparseblockgemv.
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemvF(self):
        """
        Test the fortran order for W (which can happen in the grad for some
        graphs).
        """
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        # DimShuffle swaps the two inner axes, so the op receives a
        # non-contiguous (Fortran-ordered) view of W.
        o = self.gemv_op(b.take(oIdx, axis=0),
                         tensor.DimShuffle((False, False, False, False),
                                           (0, 1, 3, 2))
                         (tensor.as_tensor_variable(W)),
                         h, iIdx, oIdx)

        f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        # The swapaxes below undoes the symbolic DimShuffle above.
        th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
                   oIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
            b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)

    def test_sparseblockgemv_grad(self):
        # NOTE(review): the gemv_data() result is immediately overwritten
        # by the hard-coded 1-sized values below, which makes this test a
        # duplicate of test_sparseblockgemv_grad_1 — confirm whether the
        # gemv_data() values were meant to be used here.
        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]
        W_val = randn(1, 1, 1, 1).astype('float32')
        b_val = randn(1, 1).astype('float32')

        iIdx = theano.tensor.constant(iIdx_val)
        oIdx = theano.tensor.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        # Check the gradient of both the user-facing meta op and the
        # underlying gemv op.
        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)

    def test_sparseblockgemv_grad_1(self):
        """
        Test that we correctly handle cases where dimensions are 1.
        """
        h_val = randn(1, 1, 1).astype('float32')
        iIdx_val = numpy.random.permutation(1)[:1][None, :]
        oIdx_val = numpy.random.permutation(1)[:1][None, :]
        W_val = randn(1, 1, 1, 1).astype('float32')
        b_val = randn(1, 1).astype('float32')

        iIdx = theano.tensor.constant(iIdx_val)
        oIdx = theano.tensor.constant(oIdx_val)

        def metaop(b, h, W):
            return sparse_block_dot(W, h, iIdx, b, oIdx)

        def op(b, h, W):
            return self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        utt.verify_grad(metaop, [b_val, h_val, W_val], mode=self.mode)
        utt.verify_grad(op, [b_val, h_val, W_val], mode=self.mode)

    def test_sparseblockgemv_grad_shape(self):
        b = tensor.fmatrix()
        W = tensor.ftensor4()
        h = tensor.ftensor3()
        iIdx = tensor.imatrix()
        oIdx = tensor.imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        go = theano.grad(o.sum(), [b, W, h])

        f = theano.function([W, h, iIdx, b, oIdx], go, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = \
            BlockSparse_Gemv_and_Outer.gemv_data()

        # just make sure that it runs correctly and all the shapes are ok.
        b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

        assert b_g.shape == b_val.shape
        assert h_g.shape == h_val.shape
        assert W_g.shape == W_val.shape

    def test_sparseblockouter(self):
        o = tensor.ftensor4()
        x = tensor.ftensor3()
        y = tensor.ftensor3()
        xIdx = tensor.imatrix()
        yIdx = tensor.imatrix()

        out = self.outer_op(o, x, y, xIdx, yIdx)

        # NOTE(review): unlike the other tests, no mode=self.mode is
        # passed here — confirm whether that is intentional.
        f = theano.function([o, x, y, xIdx, yIdx], out,
                            on_unused_input="warn")

        o_val, x_val, y_val, xIdx_val, yIdx_val = \
            BlockSparse_Gemv_and_Outer.outer_data()

        th_out = f(o_val, x_val, y_val, xIdx_val, yIdx_val)
        ref_out = BlockSparse_Gemv_and_Outer.outer_numpy(
            o_val, x_val, y_val, xIdx_val, yIdx_val)

        utt.assert_allclose(ref_out, th_out)
...@@ -4,7 +4,7 @@ import numpy ...@@ -4,7 +4,7 @@ import numpy
import theano import theano
from theano import config, function, tensor from theano import config, function, tensor
from . import multinomial from theano.sandbox import multinomial
from theano.compile.mode import get_default_mode, predefined_linkers from theano.compile.mode import get_default_mode, predefined_linkers
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
......
import theano
from theano import tensor
from theano.sandbox.blocksparse import sparse_block_dot
def test_blocksparse_inplace_gemv_opt():
    """
    The SparseBlockGemv node should be replaced by its inplace version,
    except under FAST_COMPILE where inplace optimizations are off.
    """
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    b = tensor.fmatrix()
    oIdx = tensor.lmatrix()

    out = sparse_block_dot(W, h, iIdx, b, oIdx)
    fn = theano.function([W, h, iIdx, b, oIdx], out)

    last_op = fn.maker.fgraph.toposort()[-1].op
    if theano.config.mode == "FAST_COMPILE":
        assert not last_op.inplace
    else:
        assert last_op.inplace
def test_blocksparse_inplace_outer_opt():
    """
    The SparseBlockOuter node (from the gradient w.r.t. W) should be
    replaced by its inplace version, except under FAST_COMPILE where
    inplace optimizations are off.
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    # Build the gradient once and reuse it; the debugprint() call that
    # used to be here was a leftover that spammed test output.
    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)])

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
import theano import theano
import numpy import numpy
from . import scan from theano.sandbox import scan
def test_001(): def test_001():
......
from __future__ import print_function from __future__ import print_function
from .theano_object import * from theano.sandbox.theano_object import *
RUN_TESTS = False RUN_TESTS = False
......
...@@ -98,17 +98,19 @@ whitelist_flake8 = [ ...@@ -98,17 +98,19 @@ whitelist_flake8 = [
"tensor/nnet/tests/test_sigm.py", "tensor/nnet/tests/test_sigm.py",
"scalar/__init__.py", "scalar/__init__.py",
"scalar/tests/test_basic.py", "scalar/tests/test_basic.py",
"sandbox/test_theano_object.py", "sandbox/__init__.py",
"sandbox/test_scan.py",
"sandbox/rng_mrg.py", "sandbox/rng_mrg.py",
"sandbox/theano_object.py", "sandbox/theano_object.py",
"sandbox/scan.py", "sandbox/scan.py",
"sandbox/test_multinomial.py",
"sandbox/test_rng_mrg.py",
"sandbox/test_neighbourhoods.py",
"sandbox/symbolic_module.py", "sandbox/symbolic_module.py",
"sandbox/conv.py", "sandbox/conv.py",
"sandbox/debug.py", "sandbox/debug.py",
"sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py",
"sandbox/tests/test_rng_mrg.py",
"sandbox/tests/test_neighbourhoods.py",
"sandbox/tests/test_multinomial.py",
"sandbox/tests/__init__.py",
"sandbox/cuda/dnn.py", "sandbox/cuda/dnn.py",
"sandbox/cuda/var.py", "sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py", "sandbox/cuda/GpuConvGrad3D.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论