Commit b69ad54d, authored by Xavier Bouthillier

Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py
...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp): ...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5 broad = (False,) * 5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs=[CudaNdarrayType(dtype=V_.dtype, outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()]) broadcastable=broad)()])
...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp): ...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2] outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3] outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4] outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d dr, dc, dt = d
dCdW = numpy.zeros(WShape, dtype=V.dtype) dCdW = numpy.zeros(WShape, dtype=V.dtype)
...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp): ...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for p in xrange(0, outputHeight): for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth): for q in xrange(0, outputWidth):
for r in xrange(0, outputDur): for r in xrange(0, outputDur):
dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m] dCdW[j, z, k, l, m] += dCdH[
i, j, p, q, r] * \
V[i, z, dr * p + k,
dc * q + l,
dt * r + m]
output_storage[0][0] = dCdW output_storage[0][0] = dCdW
...@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp): ...@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvGrad3D > ///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n"); //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
...@@ -285,7 +284,7 @@ if(!work_complete){ ...@@ -285,7 +284,7 @@ if(!work_complete){
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the # It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position. # input pixels that were downsampled to that z-position.
codeSource = """ codeSource = """
__global__ void __global__ void
//thread block size = WShape[4] //thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3]) //grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
......
...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp): ...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else: else:
RShape_ = T.as_tensor_variable([-1, -1, -1]) RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_], return theano.Apply(
outputs=[CudaNdarrayType(dtype=H_.dtype, self, inputs=[W_, b_, d_, H_, RShape_],
broadcastable=(False,)*5)()]) outputs=[CudaNdarrayType(
dtype=H_.dtype, broadcastable=(False,) * 5)()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs W, b, d, H, RShape = node.inputs
...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert dc > 0 assert dc > 0
assert dt > 0 assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight videoHeight = (outputHeight - 1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth videoWidth = (outputWidth - 1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur videoDur = (outputDur - 1) * dt + filterDur
if Rshape is not None and Rshape[0] != -1: if Rshape is not None and Rshape[0] != -1:
if Rshape[0] < videoHeight: if Rshape[0] < videoHeight:
...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else: # else:
# print "No Rshape passed in" # print "No Rshape passed in"
# print "video size: "+str((videoHeight, videoWidth, videoDur)) # print "video size: " + str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight, R = numpy.zeros((batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur),
dtype=H.dtype)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0, batchSize): for i in xrange(0, batchSize):
# print '\texample '+str(i+1)+'/'+str(batchSize) # print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0, inputChannels): for j in xrange(0, inputChannels):
# print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels) # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
for r in xrange(0, videoHeight): for r in xrange(0, videoHeight):
# print '\t\t\trow '+str(r+1)+'/'+str(videoHeight) # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
for c in xrange(0, videoWidth): for c in xrange(0, videoWidth):
for t in xrange(0, videoDur): for t in xrange(0, videoDur):
R[i, j, r, c, t] = b[j] R[i, j, r, c, t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur + 1 )/float(dt))) ]) ftc = max(
fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ]) [0,
int(numpy.ceil(
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ]) float(t - filterDur + 1) / float(dt)
))
]
)
fcc = max(
[0,
int(numpy.ceil(
float(c - filterWidth + 1) / float(dc)
))
]
)
rc = max(
[0,
int(numpy.ceil(
float(r - filterHeight + 1) / float(dr)
))
]
)
while rc < outputHeight: while rc < outputHeight:
rk = r - rc * dr rk = r - rc * dr
if rk < 0: if rk < 0:
...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if tk < 0: if tk < 0:
break break
R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] ) R[i, j, r, c, t] += numpy.dot(
W[:, j, rk, ck, tk],
H[i, :, rc, cc, tc])
tc += 1 tc += 1
"" # close loop over tc "" # close loop over tc
......
...@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division ...@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
import copy import copy
import logging import logging
import sys import sys
import warnings
import numpy import numpy
from six import iteritems from six import iteritems
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
...@@ -12,6 +12,9 @@ from theano import gof, Type, Apply ...@@ -12,6 +12,9 @@ from theano import gof, Type, Apply
from theano import tensor, scalar, config from theano import tensor, scalar, config
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.scalar import Scalar from theano.scalar import Scalar
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.elemwise import NaiveAlgo
scal = scalar # somewhere scalar gets reassigned to be a function scal = scalar # somewhere scalar gets reassigned to be a function
...@@ -24,10 +27,6 @@ try: ...@@ -24,10 +27,6 @@ try:
except ImportError: except ImportError:
pass pass
from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.elemwise import NaiveAlgo
_logger_name = 'theano.sandbox.cuda.basic_ops' _logger_name = 'theano.sandbox.cuda.basic_ops'
_logger = logging.getLogger(_logger_name) _logger = logging.getLogger(_logger_name)
...@@ -79,7 +78,7 @@ class HostFromGpu(GpuOp): ...@@ -79,7 +78,7 @@ class HostFromGpu(GpuOp):
"CudaNdarrayType. Got %s with type %s" % (x, "CudaNdarrayType. Got %s with type %s" % (x,
x.type)) x.type))
return Apply(self, [x], [tensor.TensorType(dtype=x.dtype, return Apply(self, [x], [tensor.TensorType(dtype=x.dtype,
broadcastable=x.broadcastable)()]) broadcastable=x.broadcastable)()])
def perform(self, node, inp, out): def perform(self, node, inp, out):
x, = inp x, = inp
...@@ -535,10 +534,10 @@ class GpuCAReduce(GpuOp): ...@@ -535,10 +534,10 @@ class GpuCAReduce(GpuOp):
Parameters Parameters
---------- ----------
pre_scalar_op pre_scalar_op
If present, must be a scalar op with only 1 input. If present, must be a scalar op with only 1 input.
We will execute it on the input value before reduction. We will execute it on the input value before reduction.
Notes Notes
----- -----
This Op is a work in progress. This Op is a work in progress.
...@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp): ...@@ -596,10 +595,8 @@ class GpuCAReduce(GpuOp):
if self.pre_scalar_op: if self.pre_scalar_op:
pre = "pre=%s,red=" % str(self.pre_scalar_op) pre = "pre=%s,red=" % str(self.pre_scalar_op)
return "GpuCAReduce{%s%s}{%s}" % ( return "GpuCAReduce{%s%s}{%s}" % (
pre, pre, str(self.scalar_op),
str(self.scalar_op), ','.join(str(i) for i in self.reduce_mask))
','.join(str(i) for i in self.reduce_mask)
)
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__.update(d) self.__dict__.update(d)
...@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp): ...@@ -775,15 +772,18 @@ class GpuCAReduce(GpuOp):
# check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code. # check if the tensor is ccontiguous, if true, use the c_code_reduce_ccontig code.
# TODO: check if we are ccontiguous when we un-dimshuffle # TODO: check if we are ccontiguous when we un-dimshuffle
# TODO: if only some dims are ccontiguous, call version with less dims. # TODO: if only some dims are ccontiguous, call version with less dims.
print('if(CudaNdarray_is_c_contiguous(%(x)s)){'%locals(), file=sio) print('if(CudaNdarray_is_c_contiguous( %(x)s)){' % locals(),
file=sio)
self.c_code_reduce_ccontig(sio, node, name, x, z, fail) self.c_code_reduce_ccontig(sio, node, name, x, z, fail)
print("}else{", file=sio) print("}else{", file=sio)
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
print("}", file=sio) print("}", file=sio)
else: else:
getattr(self, 'c_code_reduce_%s'%(''.join( getattr(self, 'c_code_reduce_%s' % (''.join(
str(i) for i in self.reduce_mask)))(sio, node, name, x, z, fail) str(i) for i in self.reduce_mask)))(
sio, node, name, x, z, fail)
# \end bracket the reduction ... # \end bracket the reduction ...
print(""" print("""
...@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp): ...@@ -976,7 +976,7 @@ class GpuCAReduce(GpuOp):
assert isinstance(self.scalar_op, (scal.Maximum, assert isinstance(self.scalar_op, (scal.Maximum,
scal.Minimum)) scal.Minimum))
if self.pre_scalar_op: if self.pre_scalar_op:
#dtype = node.inputs[0].dtype # dtype = node.inputs[0].dtype
dtype = 'float32' dtype = 'float32'
dummy_var = scal.Scalar(dtype=dtype)() dummy_var = scal.Scalar(dtype=dtype)()
...@@ -1275,7 +1275,7 @@ class GpuCAReduce(GpuOp): ...@@ -1275,7 +1275,7 @@ class GpuCAReduce(GpuOp):
def c_code_reduce_01X(self, sio, node, name, x, z, fail, N): def c_code_reduce_01X(self, sio, node, name, x, z, fail, N):
""" """
Parameters Parameters
---------- ----------
N : int N : int
...@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp): ...@@ -1834,12 +1834,15 @@ class GpuCAReduce(GpuOp):
version = [15] # the version corresponding to the c code in this Op version = [15] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply(self.scalar_op, Apply(self.scalar_op,
[Scalar(dtype=input.type.dtype)() for input in node.inputs], [Scalar(
[Scalar(dtype=output.type.dtype)() for output in node.outputs]) dtype=input.type.dtype)() for input in node.inputs],
[Scalar(
dtype=output.type.dtype)() for output in node.outputs])
version.extend(self.scalar_op.c_code_cache_version()) version.extend(self.scalar_op.c_code_cache_version())
for i in node.inputs + node.outputs: for i in node.inputs + node.outputs:
version.extend(Scalar(dtype=i.type.dtype).c_code_cache_version()) version.extend(
Scalar(dtype=i.type.dtype).c_code_cache_version())
if all(version): if all(version):
return tuple(version) return tuple(version)
else: else:
...@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp): ...@@ -1946,10 +1949,11 @@ class GpuCAReduce(GpuOp):
%(reducebuf)s %(reducebuf)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
#01, 011, 0111 # 01, 011, 0111
if (0 == self.reduce_mask[0] and if (0 == self.reduce_mask[0] and
all(self.reduce_mask[1:]) and all(self.reduce_mask[1:]) and
nd_in in[2, 3, 4]): nd_in in[2, 3, 4]):
# this kernel uses one block for each row. # this kernel uses one block for each row.
# threads per block for each element per row. # threads per block for each element per row.
...@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp): ...@@ -2117,10 +2121,10 @@ class GpuCAReduce(GpuOp):
# this kernel uses one block for multiple column(up to 32TODO), # this kernel uses one block for multiple column(up to 32TODO),
# threads per block for each element per column. # threads per block for each element per column.
# thread.x = dim 2 contiguous # thread.x = dim 2 contiguous
# thread.y = dim 1 # thread.y = dim 1
# block.x = dim 0 # block.x = dim 0
# block.y = dim 1 rest # block.y = dim 1 rest
init = self._k_init(node, nodename) init = self._k_init(node, nodename)
decl = self._k_decl(node, nodename, pattern="010_inner") decl = self._k_decl(node, nodename, pattern="010_inner")
reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]', reducebuf = self._k_reduce_buf_multiple('Z[i0 * sZ0 + i2*sZ1]',
...@@ -2294,7 +2298,7 @@ class GpuCAReduce(GpuOp): ...@@ -2294,7 +2298,7 @@ class GpuCAReduce(GpuOp):
} }
""" % locals(), file=sio) """ % locals(), file=sio)
if self.reduce_mask == (0, 0, 1, 1): if self.reduce_mask == (0, 0, 1, 1):
# this kernel uses one block for each row, # this kernel uses one block for each row,
# threads per block for each element per row. # threads per block for each element per row.
reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]', reducebuf = self._k_reduce_buf('Z[i0 * sZ0 + i1 * sZ1]',
node, nodename, sub={}) node, nodename, sub={})
...@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp): ...@@ -2470,7 +2474,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
if (x.size % ss) != 0: if (x.size % ss) != 0:
raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.") raise ValueError("When using -1 in new shape, the computed new shape must be an multiple of the original shape.")
shp_new = numpy.copy(shp) shp_new = numpy.copy(shp)
shp_new[m1_idx] = x.size/ss shp_new[m1_idx] = x.size / ss
shp = shp_new shp = shp_new
else: else:
...@@ -2711,7 +2715,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -2711,7 +2715,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
# c code suppose it is int64 # c code suppose it is int64
if x.ndim in [1, 2, 3] and ilist_.dtype in [ if x.ndim in [1, 2, 3] and ilist_.dtype in [
'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']: 'int8', 'int16', 'int32', 'uint8', 'uint16', 'uint32']:
ilist_ = tensor.cast(ilist_, 'int64') ilist_ = tensor.cast(ilist_, 'int64')
bcast = (ilist_.broadcastable[0],) + x_.broadcastable[1:] bcast = (ilist_.broadcastable[0],) + x_.broadcastable[1:]
...@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -2721,7 +2725,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
def perform(self, node, inp, out_): def perform(self, node, inp, out_):
# This don't work as CudaNdarray_Subscript() don't support it. # This don't work as CudaNdarray_Subscript() don't support it.
#super(GpuAdvancedSubtensor1, self).perform(node, inp, out_) # super(GpuAdvancedSubtensor1, self).perform(node, inp, out_)
x, idx = inp x, idx = inp
out, = out_ out, = out_
x_orig = x x_orig = x
...@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -2733,7 +2737,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
if x.ndim <= 3: if x.ndim <= 3:
# CudaNdarray.take only supports ndim <= 3 # CudaNdarray.take only supports ndim <= 3
if self.perform_using_take is not None: if self.perform_using_take is not None:
assert self.perform_using_take == True, ( assert self.perform_using_take is True, (
"GpuAdvancedSubtensor1 used the fast version") "GpuAdvancedSubtensor1 used the fast version")
if idx.dtype != numpy.int64: if idx.dtype != numpy.int64:
if idx.dtype in [numpy.int8, numpy.int16, numpy.int32, if idx.dtype in [numpy.int8, numpy.int16, numpy.int32,
...@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): ...@@ -2762,7 +2766,7 @@ class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
out[0] = o out[0] = o
else: else:
if self.perform_using_take is not None: if self.perform_using_take is not None:
assert self.perform_using_take == False, ( assert self.perform_using_take is False, (
"GpuAdvancedSubtensor1 didn't use the fast version") "GpuAdvancedSubtensor1 didn't use the fast version")
if out_[0][0] is None or out_[0][0].shape != out_shape: if out_[0][0] is None or out_[0][0].shape != out_shape:
o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape) o = cuda_ndarray.cuda_ndarray.CudaNdarray.zeros(out_shape)
...@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3006,8 +3010,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
convert_map = {8: tensor.basic._convert_to_int8, convert_map = {8: tensor.basic._convert_to_int8,
16: tensor.basic._convert_to_int16, 16: tensor.basic._convert_to_int16,
32: tensor.basic._convert_to_int32, 32: tensor.basic._convert_to_int32,
64: tensor.basic._convert_to_int64 64: tensor.basic._convert_to_int64}
}
intwidth = theano.configdefaults.python_int_bitwidth() intwidth = theano.configdefaults.python_int_bitwidth()
ilist_ = convert_map[intwidth](ilist_) ilist_ = convert_map[intwidth](ilist_)
...@@ -3039,8 +3042,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -3039,8 +3042,8 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if ((node.inputs[0].ndim != node.inputs[1].ndim) or if ((node.inputs[0].ndim != node.inputs[1].ndim) or
(node.inputs[0].ndim != 2) or (node.inputs[0].ndim != 2) or
(compute_capability < 2)): (compute_capability < 2)):
raise NotImplementedError("This case does not have C code yet.") raise NotImplementedError("This case does not have C code yet.")
x = inputs[0] x = inputs[0]
...@@ -3212,7 +3215,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -3212,7 +3215,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
return Apply(self, [x, y] + rval.inputs[2:], [x.type()]) return Apply(self, [x, y] + rval.inputs[2:], [x.type()])
def do_type_checking(self, node): def do_type_checking(self, node):
""" """
Should raise NotImplementedError if c_code does not support Should raise NotImplementedError if c_code does not support
the types involved in this node. the types involved in this node.
...@@ -3248,7 +3251,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -3248,7 +3251,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Parameters Parameters
---------- ----------
x : str x : str
A string identifying an array to be viewed. A string identifying an array to be viewed.
view_ndim : str view_ndim : str
...@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp): ...@@ -3354,7 +3357,6 @@ class GpuFlatten(gof.HideC, tensor.Flatten, GpuOp):
return Apply(self, [x], [out_type()]) return Apply(self, [x], [out_type()])
def gpu_flatten(x, outdim=1): def gpu_flatten(x, outdim=1):
""" """
Implement flatten on the gpu. Implement flatten on the gpu.
...@@ -3378,10 +3380,10 @@ def gpu_flatten(x, outdim=1): ...@@ -3378,10 +3380,10 @@ def gpu_flatten(x, outdim=1):
""" """
x = as_cuda_ndarray_variable(x) x = as_cuda_ndarray_variable(x)
if outdim > 1: if outdim > 1:
dims = tuple(x.shape[:outdim-1])+(-1,) dims = tuple(x.shape[:outdim - 1]) + (-1, )
else: else:
dims = (-1,) dims = (-1, )
return GpuReshape(outdim)(x, dims) return GpuReshape(outdim)(x, dims)
class GpuShape(tensor.Shape, GpuOp): class GpuShape(tensor.Shape, GpuOp):
...@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp): ...@@ -3408,12 +3410,11 @@ class GpuJoin(tensor.Join, GpuOp):
as_tensor_variable_args = [as_cuda_ndarray_variable(x) as_tensor_variable_args = [as_cuda_ndarray_variable(x)
for x in tensors] for x in tensors]
output_maker = \ def output_maker(bcast):
lambda bcast: CudaNdarrayType(broadcastable=bcast)() return(CudaNdarrayType(broadcastable=bcast)())
return tensor.Join._make_node_internal(self, return tensor.Join._make_node_internal(
axis, tensors, self, axis, tensors, as_tensor_variable_args, output_maker)
as_tensor_variable_args, output_maker)
def perform(self, node, axis_and_tensors, out_): def perform(self, node, axis_and_tensors, out_):
out, = out_ out, = out_
...@@ -3464,8 +3465,8 @@ class GpuJoin(tensor.Join, GpuOp): ...@@ -3464,8 +3465,8 @@ class GpuJoin(tensor.Join, GpuOp):
# except for 'axis' # except for 'axis'
def construct_slices(curlen): def construct_slices(curlen):
slices = [slice(None, None, None) for i in \ slices = [slice(None, None, None) for i in
xrange(len(template_shape))] xrange(len(template_shape))]
slices[axis] = slice(curpos, curpos + curlen, None) slices[axis] = slice(curpos, curpos + curlen, None)
return tuple(slices) return tuple(slices)
...@@ -3829,23 +3830,22 @@ class GpuAlloc(GpuAllocEmpty): ...@@ -3829,23 +3830,22 @@ class GpuAlloc(GpuAllocEmpty):
# If the output is a constant, it will have to be deepcopied # If the output is a constant, it will have to be deepcopied
# each time the function is called. So we do not fold. # each time the function is called. So we do not fold.
return False return False
elif ( # The following ops work inplace of their input id 0. # Else if the following ops work inplace of their input id 0.
client[1] == 0 and elif(client[1] == 0 and
isinstance(client[0].op, ( isinstance(client[0].op, (
# Ops that will work inplace on the Alloc. So if they # Ops that will work inplace on the Alloc. So if they
# get constant_folded, they would copy the # get constant_folded, they would copy the
# constant and this is less efficients. # constant and this is less efficients.
# Not doing the constant folding could also lower # Not doing the constant folding could also lower
# the peak memory usage, as we the "constant" won't # the peak memory usage, as we the "constant" won't
# always exists. # always exists.
# theano.tensor.subtensor.AdvancedIncSubtensor, # theano.tensor.subtensor.AdvancedIncSubtensor,
GpuIncSubtensor, GpuIncSubtensor,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1,
theano.sandbox.cuda.blas.GpuGemm, theano.sandbox.cuda.blas.GpuGemm,
theano.sandbox.cuda.blas.GpuGemv, theano.sandbox.cuda.blas.GpuGemv,
theano.sandbox.cuda.blas.GpuGer, theano.sandbox.cuda.blas.GpuGer,))):
))):
return False return False
# If the clients is a transfer, we don't want to fold. We # If the clients is a transfer, we don't want to fold. We
# let the moving opt finish before deciding what to do. # let the moving opt finish before deciding what to do.
...@@ -3859,7 +3859,7 @@ gpu_alloc = GpuAlloc() ...@@ -3859,7 +3859,7 @@ gpu_alloc = GpuAlloc()
class CopyOnNegativeStrides(GpuOp): class CopyOnNegativeStrides(GpuOp):
""" """
Checks if the input has contains negative strides. Checks if the input has contains negative strides.
If it does, returns a c contiguous copy. If it does, returns a c contiguous copy.
""" """
...@@ -4017,7 +4017,7 @@ def scalar(name=None, dtype=None): ...@@ -4017,7 +4017,7 @@ def scalar(name=None, dtype=None):
Parameters Parameters
---------- ----------
dtype dtype
Numeric type (None means to use theano.config.floatX). Numeric type (None means to use theano.config.floatX).
name : str name : str
A name to attach to this variable. A name to attach to this variable.
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy
import os import os
import logging import logging
_logger = logging.getLogger(__name__)
from six import integer_types from six import integer_types
from six.moves import StringIO, reduce from six.moves import StringIO, reduce
import theano import theano
from theano import Apply from theano import Apply
from theano import tensor from theano import tensor
...@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp ...@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
_logger = logging.getLogger(__name__)
class GpuBatchedDot(GpuOp): class GpuBatchedDot(GpuOp):
...@@ -29,11 +26,11 @@ class GpuBatchedDot(GpuOp): ...@@ -29,11 +26,11 @@ class GpuBatchedDot(GpuOp):
assert inp1.dtype == "float32" assert inp1.dtype == "float32"
assert inp2.dtype == "float32" assert inp2.dtype == "float32"
assert inp1.ndim == 3 # (batch, a, b) assert inp1.ndim == 3 # (batch, a, b)
assert inp2.ndim == 3 assert inp2.ndim == 3
return theano.Apply(self, [inp1, inp2], return theano.Apply(self, [inp1, inp2],
[self.output_type(inp1, inp2)()]) [self.output_type(inp1, inp2)()])
def output_type(self, inp1, inp2): def output_type(self, inp1, inp2):
return CudaNdarrayType( return CudaNdarrayType(
...@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp): ...@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
} }
} else { } else {
// copy inputs if not contiguous // copy inputs if not contiguous
""" + """ + ("\n".join("""
("\n".join("""
if (( CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1 if (( CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1 && CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1) && CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1)
...@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp): ...@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(%(var)s); Py_XDECREF(%(var)s);
%(var)s = _copy; %(var)s = _copy;
} }
""" % dict(var=var, fail=fail) for var in (bx, by))) """ % dict(var=var, fail=fail) for var in (bx, by))) + """
+ """
// fail if the output is not contiguous; we can't copy it because we // fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory // need to write to the original memory
...@@ -363,7 +358,7 @@ class GpuDot22(GpuOp): ...@@ -363,7 +358,7 @@ class GpuDot22(GpuOp):
if y.type.ndim != 2: if y.type.ndim != 2:
raise TypeError(y) raise TypeError(y)
otype = CudaNdarrayType( otype = CudaNdarrayType(
(x.type.broadcastable[0], y.type.broadcastable[1])) (x.type.broadcastable[0], y.type.broadcastable[1]))
return Apply(self, [x, y], [otype()]) return Apply(self, [x, y], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -451,7 +446,7 @@ class GpuDot22Scalar(GpuOp): ...@@ -451,7 +446,7 @@ class GpuDot22Scalar(GpuOp):
if not tensor.blas._as_scalar(a): if not tensor.blas._as_scalar(a):
raise TypeError(a) raise TypeError(a)
otype = CudaNdarrayType( otype = CudaNdarrayType(
(x.type.broadcastable[0], y.type.broadcastable[1])) (x.type.broadcastable[0], y.type.broadcastable[1]))
return Apply(self, [x, y, a], [otype()]) return Apply(self, [x, y, a], [otype()])
def c_code_cache_version(self): def c_code_cache_version(self):
...@@ -537,8 +532,8 @@ class GpuGemm(GpuOp): ...@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return 'GpuGemm{no_inplace}' return 'GpuGemm{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -562,7 +557,7 @@ class GpuGemm(GpuOp): ...@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return (4,) return (4,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in # inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
...@@ -657,8 +652,8 @@ class GpuGemv(GpuOp): ...@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return 'GpuGemv{no_inplace}' return 'GpuGemv{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -682,7 +677,7 @@ class GpuGemv(GpuOp): ...@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return (3,) return (3,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in # inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
...@@ -757,8 +752,8 @@ class GpuGer(GpuOp): ...@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return 'GpuGer{no_inplace}' return 'GpuGer{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -782,7 +777,7 @@ class GpuGer(GpuOp): ...@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return (2,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set set z_out = z_in # inplace version, set set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs z_in, a, x, y = inputs
...@@ -934,7 +929,7 @@ class BaseGpuCorrMM(GpuOp): ...@@ -934,7 +929,7 @@ class BaseGpuCorrMM(GpuOp):
# these files # these files
files = ['corr_gemm.cu'] files = ['corr_gemm.cu']
codes = [open(os.path.join(os.path.split(__file__)[0], f)).read() codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files] for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
...@@ -947,7 +942,7 @@ class BaseGpuCorrMM(GpuOp): ...@@ -947,7 +942,7 @@ class BaseGpuCorrMM(GpuOp):
Parameters Parameters
---------- ----------
bottom bottom
Variable name of the input images in the forward pass, Variable name of the input images in the forward pass,
or the gradient of the input images in backprop wrt. inputs or the gradient of the input images in backprop wrt. inputs
weights weights
...@@ -1001,7 +996,7 @@ class BaseGpuCorrMM(GpuOp): ...@@ -1001,7 +996,7 @@ class BaseGpuCorrMM(GpuOp):
out = bottom out = bottom
else: else:
raise ValueError("direction must be one of 'forward', " raise ValueError("direction must be one of 'forward', "
"'backprop weights', 'backprop inputs'") "'backprop weights', 'backprop inputs'")
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
...@@ -1158,7 +1153,7 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -1158,7 +1153,7 @@ class GpuCorrMM(BaseGpuCorrMM):
Parameters Parameters
---------- ----------
border_mode border_mode
The width of a border of implicit zeros to pad the The width of a border of implicit zeros to pad the
input with. Must be a tuple with 2 elements giving the numbers of rows input with. Must be a tuple with 2 elements giving the numbers of rows
and columns to pad on each side, or a single integer to pad the same and columns to pad on each side, or a single integer to pad the same
on all sides, or a string shortcut setting the padding at runtime: on all sides, or a string shortcut setting the padding at runtime:
...@@ -1174,7 +1169,7 @@ class GpuCorrMM(BaseGpuCorrMM): ...@@ -1174,7 +1169,7 @@ class GpuCorrMM(BaseGpuCorrMM):
but faster. but faster.
Set to `(1, 1)` to disable subsampling. Set to `(1, 1)` to disable subsampling.
pad pad
Deprecated alias for `border_mode`. Deprecated alias for `border_mode`.
Notes Notes
----- -----
...@@ -1247,8 +1242,8 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -1247,8 +1242,8 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
""" """
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
pad=(0, 0)): pad=(0, 0)):
super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad) super(GpuCorrMM_gradWeights, self).__init__(border_mode, subsample, pad)
def make_node(self, img, topgrad, shape=None): def make_node(self, img, topgrad, shape=None):
...@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom, top = inp[:2] bottom, top = inp[:2]
weights, = grads weights, = grads
weights = gpu_contiguous(weights) weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)( d_bottom = GpuCorrMM_gradInputs(
weights, top, bottom.shape[-2:]) self.border_mode, self.subsample)(weights,
d_top = GpuCorrMM(self.border_mode, self.subsample)( top,
bottom, weights) bottom.shape[-2:])
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else () d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_bottom, d_top) + d_height_width return (d_bottom, d_top) + d_height_width
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -1309,8 +1308,8 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1309,8 +1308,8 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
""" """
def __init__(self, border_mode="valid", def __init__(self, border_mode="valid",
subsample=(1, 1), subsample=(1, 1),
pad=(0, 0)): pad=(0, 0)):
super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad) super(GpuCorrMM_gradInputs, self).__init__(border_mode, subsample, pad)
def make_node(self, kern, topgrad, shape=None): def make_node(self, kern, topgrad, shape=None):
...@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights, top = inp[:2] weights, top = inp[:2]
bottom, = grads bottom, = grads
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)( d_weights = GpuCorrMM_gradWeights(
self.border_mode, self.subsample)(
bottom, top, weights.shape[-2:]) bottom, top, weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode, self.subsample)( d_top = GpuCorrMM(
bottom, weights) self.border_mode, self.subsample)(bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else () d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_weights, d_top) + d_height_width return (d_weights, d_top) + d_height_width
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -1412,7 +1414,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1412,7 +1414,7 @@ class BaseGpuCorr3dMM(GpuOp):
# these files # these files
files = ['corr3d_gemm.cu'] files = ['corr3d_gemm.cu']
codes = [open(os.path.join(os.path.split(__file__)[0], f)).read() codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files] for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_code_helper(self, bottom, weights, def c_code_helper(self, bottom, weights,
...@@ -1459,7 +1461,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1459,7 +1461,7 @@ class BaseGpuCorr3dMM(GpuOp):
If self.pad == 'half', a variable giving the width of the filters If self.pad == 'half', a variable giving the width of the filters
for direction="backprop weights". for direction="backprop weights".
Ignored otherwise. Ignored otherwise.
depth depth
If self.subsample[2] != 1, a variable giving the depth If self.subsample[2] != 1, a variable giving the depth
of the filters for direction="backprop weights" or the depth of the of the filters for direction="backprop weights" or the depth of the
input images for direction="backprop inputs". input images for direction="backprop inputs".
...@@ -1488,7 +1490,7 @@ class BaseGpuCorr3dMM(GpuOp): ...@@ -1488,7 +1490,7 @@ class BaseGpuCorr3dMM(GpuOp):
out = bottom out = bottom
else: else:
raise ValueError("direction must be one of 'forward', " raise ValueError("direction must be one of 'forward', "
"'backprop weights', 'backprop inputs'") "'backprop weights', 'backprop inputs'")
# When subsampling, we cannot unambiguously infer the height and width # When subsampling, we cannot unambiguously infer the height and width
# of bottom and weights from top, so we require them to be given. # of bottom and weights from top, so we require them to be given.
# Similarly, when pad="half", we cannot infer the weight size. # Similarly, when pad="half", we cannot infer the weight size.
...@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom, weights = inp bottom, weights = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)( d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
weights, top, bottom.shape[-3:]) self.subsample,
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)( self.pad)(weights,
bottom, top, weights.shape[-3:]) top,
bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights, top = inp[:2] weights, top = inp[:2]
bottom, = grads bottom, = grads
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)( d_weights = GpuCorr3dMM_gradWeights(
bottom, top, weights.shape[-3:]) self.border_mode, self.subsample, self.pad)(
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)( bottom, top, weights.shape[-3:])
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.pad)(
bottom, weights) bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else () d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth return (d_weights, d_top) + d_height_width_depth
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -1938,19 +1949,19 @@ class GpuConv(GpuOp): ...@@ -1938,19 +1949,19 @@ class GpuConv(GpuOp):
raise ValueError(mode) raise ValueError(mode)
def __init__(self, border_mode, def __init__(self, border_mode,
subsample=(1, 1), subsample=(1, 1),
logical_img_hw=None, logical_img_hw=None,
logical_kern_hw=None, logical_kern_hw=None,
logical_kern_align_top=True, logical_kern_align_top=True,
version=-1, version=-1,
direction_hint=None, direction_hint=None,
verbose=0, verbose=0,
kshp=None, kshp=None,
imshp=None, imshp=None,
max_threads_dim0=None, max_threads_dim0=None,
nkern=None, nkern=None,
bsize=None, bsize=None,
fft_opt=True): fft_opt=True):
self.border_mode = border_mode self.border_mode = border_mode
if version != -1: if version != -1:
raise Exception( raise Exception(
...@@ -2107,7 +2118,7 @@ class GpuConv(GpuOp): ...@@ -2107,7 +2118,7 @@ class GpuConv(GpuOp):
# these files # these files
files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu'] files = ['conv_kernel.cu', 'conv_full_kernel.cu', 'conv.cu']
codes = [open(os.path.join(os.path.split(__file__)[0], f)).read() codes = [open(os.path.join(os.path.split(__file__)[0], f)).read()
for f in files] for f in files]
return reduce(str.__add__, codes) return reduce(str.__add__, codes)
def c_code(self, node, nodename, inp, out_, sub): def c_code(self, node, nodename, inp, out_, sub):
...@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
# def perform(self, node, input_storage, output_storage): # def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') # raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return (6) return (6)
......
...@@ -97,7 +97,7 @@ class NaiveAlgo(object): ...@@ -97,7 +97,7 @@ class NaiveAlgo(object):
self.scalar_op.__class__.__name__, nodename, nd), file=sio) self.scalar_op.__class__.__name__, nodename, nd), file=sio)
if (nd): if (nd):
print("\t,", ", ".join("const int dim%i" % i print("\t,", ", ".join("const int dim%i" % i
for i in xrange(nd)), file=sio) for i in xrange(nd)), file=sio)
# declare inputs # declare inputs
for ipos, i in enumerate(node.inputs): for ipos, i in enumerate(node.inputs):
s = ", ".join(["const float * i%i_data" % ipos] + s = ", ".join(["const float * i%i_data" % ipos] +
...@@ -108,8 +108,8 @@ class NaiveAlgo(object): ...@@ -108,8 +108,8 @@ class NaiveAlgo(object):
s = ", ".join(["float * o%i_data" % ipos] + s = ", ".join(["float * o%i_data" % ipos] +
["int o%i_str_%i" % (ipos, d) for d in xrange(nd)]) ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
print("\t,", s, file=sio) print("\t,", s, file=sio)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos # print >> sio, "\t,", "float * o%i_data" % ipos
print("\t)\n{", file=sio) print("\t)\n{", file=sio)
print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio) print(" const int idx = blockIdx.x * blockDim.x + threadIdx.x;", file=sio)
print(" const int numThreads = blockDim.x * gridDim.x;", file=sio) print(" const int numThreads = blockDim.x * gridDim.x;", file=sio)
...@@ -129,7 +129,7 @@ class NaiveAlgo(object): ...@@ -129,7 +129,7 @@ class NaiveAlgo(object):
print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio) print(" const float * ii_i%i_data = i%i_data;" % (ipos, ipos), file=sio)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio) print(" float * ii_o%i_data = o%i_data;" % (ipos, ipos), file=sio)
for d in xrange(nd-1, -1, -1): for d in xrange(nd - 1, -1, -1):
if d > 0: if d > 0:
print(" int pos%i = ii %% dim%i;" % (d, d), file=sio) print(" int pos%i = ii %% dim%i;" % (d, d), file=sio)
print(" ii = ii / dim%i;" % d, file=sio) print(" ii = ii / dim%i;" % d, file=sio)
...@@ -161,9 +161,9 @@ class NaiveAlgo(object): ...@@ -161,9 +161,9 @@ class NaiveAlgo(object):
print("ii_o%i_data[0] = o%i_i;" % (ipos, ipos), file=sio) print("ii_o%i_data[0] = o%i_i;" % (ipos, ipos), file=sio)
print(" }", file=sio) print(" }", file=sio)
#indent = " "*(4*d+7) # indent = " "*(4*d+7)
# for ipos, i in enumerate(node.inputs): # for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' # print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print("}", file=sio) print("}", file=sio)
# print sio.getvalue() # print sio.getvalue()
...@@ -211,10 +211,11 @@ class NaiveAlgo(object): ...@@ -211,10 +211,11 @@ class NaiveAlgo(object):
print("// Input ", ipos, str(i.type), file=sio) print("// Input ", ipos, str(i.type), file=sio)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print("// Output ", ipos, str(i.type), file=sio) print("// Output ", ipos, str(i.type), file=sio)
print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( print(
self.scalar_op.__class__.__name__, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
nodename, (self.scalar_op.__class__.__name__,
'tiling%i'%nd), file=sio) nodename,
'tiling%i' % nd), file=sio)
if (nd): if (nd):
print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio) print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
# declare inputs # declare inputs
...@@ -225,15 +226,15 @@ class NaiveAlgo(object): ...@@ -225,15 +226,15 @@ class NaiveAlgo(object):
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print("\t,", s, file=sio) print("\t,", s, file=sio)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos # print >> sio, "\t,", "float * o%i_data" % ipos
print("\t)\n{", file=sio) print("\t)\n{", file=sio)
# For each input that is a scalar which has been broadcasted to a tensor, # For each input that is a scalar which has been broadcasted to a tensor,
# load it into a local variable # load it into a local variable
print(" __shared__ float value0[%i];" % len(node.inputs), file=sio) print(" __shared__ float value0[%i];" % len(node.inputs), file=sio)
print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio) print(" __shared__ int shared_dims[%(nd)s];" % locals(), file=sio)
#print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" # print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio) print(" if ((threadIdx.x == 0) && (threadIdx.y == 0)) {", file=sio)
for ipos, i in enumerate(node.inputs): for ipos, i in enumerate(node.inputs):
if _logical_scalar(i): if _logical_scalar(i):
...@@ -274,15 +275,18 @@ class NaiveAlgo(object): ...@@ -274,15 +275,18 @@ class NaiveAlgo(object):
# perform the scalar operation on the input and output references # perform the scalar operation on the input and output references
# TODO: What if the scalar_op needs support_code?? # TODO: What if the scalar_op needs support_code??
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(
[scalar.Scalar(dtype=input.type.dtype).make_variable() self.scalar_op,
for input in node.inputs], [scalar.Scalar(
[scalar.Scalar(dtype=output.type.dtype).make_variable() dtype=input.type.dtype).make_variable()
for output in node.outputs]) for input in node.inputs],
, nodename + '_scalar_' [scalar.Scalar(
, get_str_list_logical_scalar(node, value_str='value0[%i]') dtype=output.type.dtype).make_variable()
, ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)] for output in node.outputs]),
, sub=dict(fail='return;')) # TODO: set a failure code somehow!!! nodename + '_scalar_',
get_str_list_logical_scalar(node, value_str='value0[%i]'),
['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
sub=dict(fail='return;')) # TODO: set a failure code somehow!!!
print(" ", task_code, file=sio) print(" ", task_code, file=sio)
print(" }" * nd, file=sio) print(" }" * nd, file=sio)
...@@ -290,9 +294,9 @@ class NaiveAlgo(object): ...@@ -290,9 +294,9 @@ class NaiveAlgo(object):
# TODO: insert runtime stride checks that select the best loop order either here, or in # TODO: insert runtime stride checks that select the best loop order either here, or in
# the host code that launched the kernel (host code probably better spot) # the host code that launched the kernel (host code probably better spot)
#indent = " "*(4*d+7) # indent = " "*(4*d+7)
# for ipos, i in enumerate(node.inputs): # for ipos, i in enumerate(node.inputs):
#print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' # print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
print("}", file=sio) print("}", file=sio)
print(sio.getvalue()) print(sio.getvalue())
...@@ -319,10 +323,11 @@ class NaiveAlgo(object): ...@@ -319,10 +323,11 @@ class NaiveAlgo(object):
print("// Input ", ipos, str(i.type), file=sio) print("// Input ", ipos, str(i.type), file=sio)
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print("// Output ", ipos, str(i.type), file=sio) print("// Output ", ipos, str(i.type), file=sio)
print("static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( print(
self.scalar_op.__class__.__name__, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" %
nodename, (self.scalar_op.__class__.__name__,
'tiling%i_less_registers'%nd), file=sio) nodename,
'tiling%i_less_registers' % nd), file=sio)
if (nd): if (nd):
print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio) print("\t,", ", ".join("const int dim%i" % i for i in xrange(nd)), file=sio)
# declare inputs # declare inputs
...@@ -333,8 +338,8 @@ class NaiveAlgo(object): ...@@ -333,8 +338,8 @@ class NaiveAlgo(object):
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
print("\t,", s, file=sio) print("\t,", s, file=sio)
#print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) # print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
#print >> sio, "\t,", "float * o%i_data" % ipos # print >> sio, "\t,", "float * o%i_data" % ipos
print("\t)\n{", file=sio) print("\t)\n{", file=sio)
# TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet.
...@@ -350,6 +355,7 @@ class NaiveAlgo(object): ...@@ -350,6 +355,7 @@ class NaiveAlgo(object):
return "s%s_str[%i][%i]" % (io, p, d) return "s%s_str[%i][%i]" % (io, p, d)
else: else:
return "%s%i_str_%i" % (io, p, d) return "%s%i_str_%i" % (io, p, d)
def limits(d): def limits(d):
if use_shared_limits: if use_shared_limits:
return "limits[%i]" % d return "limits[%i]" % d
...@@ -417,15 +423,19 @@ class NaiveAlgo(object): ...@@ -417,15 +423,19 @@ class NaiveAlgo(object):
def task_code(d): def task_code(d):
print(self.scalar_op.c_code( print(self.scalar_op.c_code(
Apply(self.scalar_op, Apply(
self.scalar_op,
[scalar.Scalar(dtype=input.type.dtype).make_variable() [scalar.Scalar(dtype=input.type.dtype).make_variable()
for input in node.inputs], for input in node.inputs],
[scalar.Scalar(dtype=output.type.dtype).make_variable() [scalar.Scalar(dtype=output.type.dtype).make_variable()
for output in node.outputs]) for output in node.outputs]),
, nodename + '_scalar_' nodename + '_scalar_',
, ['i%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.inputs)] ['i%i_data_%i[0]' % (ipos, d) for ipos,
, ['o%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.outputs)] i in enumerate(node.inputs)],
, sub=dict(fail='return;')), file=sio) # TODO: set a failure code somehow!!! ['o%i_data_%i[0]' % (ipos, d) for ipos,
i in enumerate(node.outputs)],
sub=dict(fail='return;')), file=sio)
# TODO: set a failure code somehow!!!
if nd == 4: if nd == 4:
decl_shared_stride(n_in, n_out, nd) decl_shared_stride(n_in, n_out, nd)
...@@ -495,16 +505,19 @@ class NaiveAlgo(object): ...@@ -495,16 +505,19 @@ class NaiveAlgo(object):
for ipos, i in enumerate(node.outputs): for ipos, i in enumerate(node.outputs):
print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio) print("npy_%s o%d_i;" % (i.dtype, ipos), file=sio)
task_code = self.scalar_op.c_code( task_code = self.scalar_op.c_code(
Apply(self.scalar_op, Apply(
[scalar.Scalar(dtype=input.type.dtype).make_variable() self.scalar_op,
for input in node.inputs], [scalar.Scalar(dtype=input.type.dtype).make_variable()
[scalar.Scalar(dtype=output.type.dtype).make_variable() for input in node.inputs],
for output in node.outputs]) [scalar.Scalar(dtype=output.type.dtype).make_variable()
, nodename + '_scalar_' for output in node.outputs]),
#, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)] nodename + '_scalar_',
, get_str_list_logical_scalar(node, data_str='i%i_data[i]') # , ['i%i_data[i]'%ipos for ipos,
, ['o%i_i'%ipos for ipos, i in enumerate(node.outputs)] # i in enumerate(node.inputs)]
, sub=dict(fail='return;')) # TODO: set a failure code somehow!!! get_str_list_logical_scalar(node, data_str='i%i_data[i]'),
['o%i_i' % ipos for ipos, i in enumerate(node.outputs)],
sub=dict(fail='return;'))
# TODO: set a failure code somehow!!!
print(" ", task_code, file=sio) print(" ", task_code, file=sio)
for ipos, _ in enumerate(node.outputs): for ipos, _ in enumerate(node.outputs):
print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio) print("o%i_data[i] = o%i_i;" % (ipos, ipos), file=sio)
...@@ -539,18 +552,21 @@ class NaiveAlgo(object): ...@@ -539,18 +552,21 @@ class NaiveAlgo(object):
nb_outputs = len(node.outputs) nb_outputs = len(node.outputs)
d = dict() d = dict()
# input_params and output_params go into the function declaration/definition # input_params and output_params go into the function declaration/definition
input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos) input_params = ", ".join(
for ipos in xrange(len(node.inputs))) "const float * i%i_data, const int * i%i_str" % (ipos, ipos)
output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos) for ipos in xrange(len(node.inputs)))
for ipos in xrange(len(node.outputs))) output_params = ", ".join(
"float * o%i_data, const int * o%i_str" % (ipos, ipos)
for ipos in xrange(len(node.outputs)))
# input_args and output_args go into the recursive call. # input_args and output_args go into the recursive call.
input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos) input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
for ipos in xrange(len(node.inputs))) for ipos in xrange(len(node.inputs)))
output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos) output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
for ipos in xrange(len(node.outputs))) for ipos in xrange(len(node.outputs)))
prod_dims = '*'.join(["dims[%i]"%di for di in xrange(nd)]+['1']) prod_dims = '*'.join(
["dims[%i]" % di for di in xrange(nd)] + ['1'])
scalar_op = self.scalar_op.__class__.__name__ scalar_op = self.scalar_op.__class__.__name__
...@@ -578,20 +594,30 @@ class NaiveAlgo(object): ...@@ -578,20 +594,30 @@ class NaiveAlgo(object):
print(""" print("""
std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
""" % locals(), file=sio) """ % locals(), file=sio)
print('std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di print(
for di in xrange(nd)) + ["'\\n';"]), file=sio) 'std::cerr << ' + " << ' ' << ".join(
['" "'] +
list("dims[%i]" % di for di in xrange(nd)) +
["'\\n';"]),
file=sio)
if self.verbose > 1: if self.verbose > 1:
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
istrings = [
"i%s_str[%i]" % (ipos, di) for di in xrange(nd)]
ipositions = " << ' ' << ".join(
["i%s_data" % ipos] + istrings)
print(""" print("""
std::cerr << " %(ipos)s data strides" << std::cerr << " %(ipos)s data strides" << %(ipositions)s << "\\n";
""" % locals() + " << ' ' << ".join(["i%s_data"%ipos] """ % dict(ipos=ipos, ipositions=ipositions), file=sio)
+ list("i%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''', file=sio)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print(""" print("""
std::cerr << " %(ipos)s data strides" << std::cerr << " %(ipos)s data strides" <<
""" % locals() + " << ' ' << ".join(["o%s_data"%ipos] """ % locals() + " << ' ' << ".join(
+ list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''', file=sio) ["o%s_data" % ipos] +
list(
"o%s_str[%i]" % (ipos, di) for di in xrange(nd)
)) +
''' << "\\n"; ''', file=sio)
# collapse dimension that are broadcast in all inputs. # collapse dimension that are broadcast in all inputs.
# need to be done before contiguous collapse as it will break it. # need to be done before contiguous collapse as it will break it.
# do the dimensions and the strides # do the dimensions and the strides
...@@ -636,11 +662,19 @@ class NaiveAlgo(object): ...@@ -636,11 +662,19 @@ class NaiveAlgo(object):
print('std::cerr << "\\n";', file=sio) print('std::cerr << "\\n";', file=sio)
if nd > 0: if nd > 0:
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
print('std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \ print(
' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) 'std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
' << " " << '.join(["local_str[%s][%s]" % (ipos, x)
for x in xrange(nd)]) +
'<<"\\n";', file=sio)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print('std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals() + \ print(
' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) 'std::cerr << " local_ostr inputs %(ipos)s: " <<' %
locals() +
' << " " << '.join(
["local_ostr[%s][%s]" %
(ipos, x) for x in xrange(nd)]) +
'<<"\\n";', file=sio)
print(""" print("""
for(int id=0;id<nd_collapse;id++){ for(int id=0;id<nd_collapse;id++){
...@@ -668,35 +702,51 @@ class NaiveAlgo(object): ...@@ -668,35 +702,51 @@ class NaiveAlgo(object):
nd_collapse--; id--; nd_collapse--; id--;
} }
} }
"""%locals(), file=sio) """ % locals(), file=sio)
if self.verbose > 2: if self.verbose > 2:
print('std::cerr <<"after broadcast collapse\\n";', file=sio) print('std::cerr <<"after broadcast collapse\\n";', file=sio)
print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio) print('std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ', file=sio)
print('std::cerr << "local_dims";', file=sio) print('std::cerr << "local_dims";', file=sio)
for d in xrange(nd): for d in xrange(nd):
print('std::cerr << " " << local_dims[%(d)s]; '%locals(), file=sio) print('std::cerr << " " << local_dims[%(d)s]; ' %
locals(), file=sio)
print('std::cerr << "\\n";', file=sio) print('std::cerr << "\\n";', file=sio)
if nd > 0: if nd > 0:
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
print('std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) print('std::cerr << " local_str %(ipos)s: " <<' %
locals() + ' << " " << '.join(
["local_str[%s][%s]" %
(ipos, x) for x in xrange(nd)]) +
'<<"\\n";', file=sio)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print('std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) print(
'std::cerr << " local_ostr %(ipos)s: " <<' %
locals() + ' << " " << '.join(
["local_ostr[%s][%s]" %
(ipos, x) for x in xrange(nd)]) +
'<<"\\n";', file=sio)
# collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle))
# this is a good idea because we make less index calculation in the gpu. # this is a good idea because we make less index calculation in the gpu.
if nd > 0: if nd > 0:
print("int nd_collapse_[%(nd)s] = {"%locals() + ','.join(['1' for x in xrange(nd)]) + "};", file=sio) print("int nd_collapse_[%(nd)s] = {" %
locals() + ','.join(
['1' for x in xrange(nd)]) + "};", file=sio)
else: else:
print("int *nd_collapse_ = NULL;", file=sio) print("int *nd_collapse_ = NULL;", file=sio)
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
if not _logical_scalar(node.inputs[ipos]): if not _logical_scalar(node.inputs[ipos]):
if nd > 0: if nd > 0:
print(""" print("""
int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() + ','.join(['1' for x in xrange(nd)]) + "};", file=sio) int nd_collapse_%(ipos)s[%(nd)s] = {""" %
locals() +
','.join(['1' for x in xrange(nd)]) +
"};", file=sio)
else: else:
print(""" print("""
int *nd_collapse_%(ipos)s = NULL;"""%locals(), file=sio) int * nd_collapse_%(ipos)s = NULL;""" %
locals(), file=sio)
print(""" print("""
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){ for(int i=0;i<nd_collapse;i++){
...@@ -707,8 +757,10 @@ nd_collapse_[i]=0; ...@@ -707,8 +757,10 @@ nd_collapse_[i]=0;
if self.verbose > 1: if self.verbose > 1:
print(""" print("""
std::cerr<< "nd_collapse_%(ipos)s "<< std::cerr<< "nd_collapse_%(ipos)s "<<
"""%locals(), file=sio) """ % locals(), file=sio)
print(' << " " << '.join(["nd_collapse_%s[" % ipos + str(i)+"]" for i in xrange(nd)]), file=sio) print(' << " " << '.join(["nd_collapse_ %s[" %
ipos + str(i) + "]" for i in xrange(nd)]),
file=sio)
print('<< "\\n";', file=sio) print('<< "\\n";', file=sio)
# update the local stride. # update the local stride.
...@@ -721,7 +773,7 @@ nd_collapse_[i]=0; ...@@ -721,7 +773,7 @@ nd_collapse_[i]=0;
local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j]; local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
} }
} }
"""%locals(), file=sio) """ % locals(), file=sio)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print(""" print("""
...@@ -732,7 +784,7 @@ nd_collapse_[i]=0; ...@@ -732,7 +784,7 @@ nd_collapse_[i]=0;
local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j]; local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
} }
} }
"""%locals(), file=sio) """ % locals(), file=sio)
# update the local dims. # update the local dims.
print(""" print("""
...@@ -743,16 +795,20 @@ nd_collapse_[i]=0; ...@@ -743,16 +795,20 @@ nd_collapse_[i]=0;
local_dims[j-1]=local_dims[j]; local_dims[j-1]=local_dims[j];
} }
} }
"""%locals(), file=sio) """ % locals(), file=sio)
# update the new number of dim # update the new number of dim
print(""" print("""
for(int i=1, end=nd_collapse;i<end;i++){ for(int i=1, end=nd_collapse;i<end;i++){
if(nd_collapse_[i]==1)nd_collapse--; if(nd_collapse_[i]==1)nd_collapse--;
} }
if(nd_collapse == 1 """%locals(), file=sio) if(nd_collapse == 1 """ % locals(), file=sio)
l = ["local_str[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])] l = ["local_str[%s][nd_collapse-1]==1 " %
l += ["local_ostr[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])] ipos for ipos in xrange(len(node.inputs)) if not
_logical_scalar(node.inputs[ipos])]
l += ["local_ostr[%s][nd_collapse-1]==1 " %
ipos for ipos in xrange(len(node.outputs)) if not
_logical_scalar(node.outputs[ipos])]
if len(l) > 0: if len(l) > 0:
print(" && ", " && ".join(l), file=sio) print(" && ", " && ".join(l), file=sio)
print("""){nd_collapse=0;} """, file=sio) print("""){nd_collapse=0;} """, file=sio)
...@@ -762,20 +818,31 @@ nd_collapse_[i]=0; ...@@ -762,20 +818,31 @@ nd_collapse_[i]=0;
print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio) print("""std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(), file=sio)
if self.verbose > 1: if self.verbose > 1:
for d in xrange(nd): for d in xrange(nd):
print('std::cerr << " " << local_dims[%(d)s]; '%locals(), file=sio) print('std::cerr << " " << local_dims[%(d)s]; ' %
locals(),
file=sio)
print('std::cerr << "\\n";', file=sio) print('std::cerr << "\\n";', file=sio)
if nd > 0: if nd > 0:
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
print('std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) print(
'std::cerr << " local_str % (ipos)s: " <<' %
locals() + ' << " " << '.join(
["local_str[%s][%s]" %
(ipos, x) for x in xrange(nd)]) +
'<<"\\n";', file=sio)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
print('std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";', file=sio) print('std::cerr << " local_ostr % (ipos)s: " <<' %
locals() + ' << " " << '.join(
["local_ostr[%s][%s]" %
(ipos, x) for x in xrange(nd)]) +
'<<"\\n";', file=sio)
def launch_Ccontiguous(nodename, scalar_op, sync=True): def launch_Ccontiguous(nodename, scalar_op, sync=True):
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
kernel_call_args.append("i%i_data"%ipos) kernel_call_args.append("i%i_data" % ipos)
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
kernel_call_args.append("o%i_data"%ipos) kernel_call_args.append("o%i_data" % ipos)
kernel_call_args = ", ".join(kernel_call_args) kernel_call_args = ", ".join(kernel_call_args)
verb = "" verb = ""
if self.verbose: if self.verbose:
...@@ -817,20 +884,27 @@ nd_collapse_[i]=0; ...@@ -817,20 +884,27 @@ nd_collapse_[i]=0;
# kernel_call_args are used to invoke the cuda kernel # kernel_call_args are used to invoke the cuda kernel
local = "local_" local = "local_"
kernel_call_args = ["numEls"] kernel_call_args = ["numEls"]
kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd)) kernel_call_args.extend(
local + "dims[%i]" %
di for di in xrange(force_nd))
for ipos in xrange(len(node.inputs)): for ipos in xrange(len(node.inputs)):
kernel_call_args += ["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd)) kernel_call_args += ["i%i_data" % ipos] + list(
#strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) local + "str[%i][%i]" %
#kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) (ipos, di) for di in xrange(force_nd))
# strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
# kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
for ipos in xrange(len(node.outputs)): for ipos in xrange(len(node.outputs)):
kernel_call_args += ["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd)) kernel_call_args += ["o%i_data" % ipos] + list(
#strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) local + "ostr[%i][%i]" %
#kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) (ipos, di) for di in xrange(force_nd))
# strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
# kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
if self.verbose: if self.verbose:
print(""" print("""
std::cerr << " Running general version with %(force_nd)s dims\\n"; std::cerr << " Running general version with %(force_nd)s dims\\n";
"""%locals(), file=sio) """ % locals(), file=sio)
print("std::cerr << " + ' << " " << '.join(kernel_call_args)+' << "\\n";', file=sio) print("std::cerr << " + ' << " " << '.join(
kernel_call_args) + ' << "\\n";', file=sio)
# std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;
kernel_call_args = ", ".join(kernel_call_args) kernel_call_args = ", ".join(kernel_call_args)
...@@ -866,12 +940,13 @@ nd_collapse_[i]=0; ...@@ -866,12 +940,13 @@ nd_collapse_[i]=0;
else: else:
print(" return 0; " % locals(), file=sio) print(" return 0; " % locals(), file=sio)
print("if(numEls==0) return 0;", file=sio) print("if(numEls==0) return 0;", file=sio)
print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals(), file=sio) print("switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" %
locals(), file=sio)
print("case 0: {", file=sio) print("case 0: {", file=sio)
launch_Ccontiguous(nodename, scalar_op, self.sync) launch_Ccontiguous(nodename, scalar_op, self.sync)
print(" } break;", file=sio) print(" } break;", file=sio)
for i in xrange(1, nd+1): for i in xrange(1, nd + 1):
print("case "+str(i)+": {", file=sio) print("case " + str(i) + ": {", file=sio)
launch_General(nodename, scalar_op, i, self.sync) launch_General(nodename, scalar_op, i, self.sync)
print(" } break;", file=sio) print(" } break;", file=sio)
...@@ -889,9 +964,10 @@ nd_collapse_[i]=0; ...@@ -889,9 +964,10 @@ nd_collapse_[i]=0;
#define INTMOD_POW2(a, b) (a & ((1<<b)-1)) #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
""" """
kernels = "".join( kernels = "".join(
[self.c_src_kernel(node, nodename, x) for x in xrange(1, nd + 1)] [self.c_src_kernel(node, nodename, x)
+ [self.c_src_kernel_Ccontiguous(node, nodename)] for x in xrange(1, nd + 1)] +
+ [self.c_src_callkernel(node, nodename)]) [self.c_src_kernel_Ccontiguous(node, nodename)] +
[self.c_src_callkernel(node, nodename)])
return defines + kernels return defines + kernels
def c_support_code(self): def c_support_code(self):
......
...@@ -5,9 +5,9 @@ import numpy as np ...@@ -5,9 +5,9 @@ import numpy as np
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse from theano.ifelse import ifelse
from theano.misc.pycuda_init import pycuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType, from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
...@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
o1 = i1 + 1 o1 = i1 + 1
input_padded = T.zeros((b, ic, o0, o1), dtype='float32') input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1], input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
input) input)
else: else:
o1 = i1 o1 = i1
input_padded = input input_padded = input
...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how much values are expected. # how much values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v, output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
input_shape=input_fft_v_shape, input_shape=input_fft_v_shape,
filter_shape=filters_fft_v_shape) filter_shape=filters_fft_v_shape)
#output_fft_s = input_fft_v # output_fft_s = input_fft_v
# reshape for IFFT # reshape for IFFT
output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2)) output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how much values are expected. # how much values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1),
(f2 - 1):(f2 - 1 + i2 - f2 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1),
(f2 - 1):(f2 - 1 + i2 + f2 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
#output = output_circ[:, :, :, :, :] # output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the # Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT. # trip through FFT and inverse FFT.
......
...@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn): ...@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
Notes Notes
----- -----
buf should be in gpu shared memory, we access it many times. buf should be in gpu shared memory, we access it many times.
""" """
...@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
return [ return [ # get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0]) inline_reduce_max(N, buf, threadPos, threadCount),
inline_reduce_max(N, buf, threadPos, threadCount), '__syncthreads()',
'__syncthreads()', 'float row_max = ' + buf + '[0]',
'float row_max = ' + buf + '[0]', '__syncthreads()',
'__syncthreads()', 'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
'for(int __i=' + threadPos + '; __i<' + N + threadCount + '){',
'; __i+=' + threadCount + '){', buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)', buf2 + '[__i] = ' + buf + '[__i]', '}',
buf2 + '[__i] = ' + buf + '[__i]', '__syncthreads()',
'}', inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), 'float row_sum = ' + buf + '[0]',
'__syncthreads()', '__syncthreads()',
'float row_sum = ' + buf + '[0]', # divide each exp() result by the sum to complete the job.
'__syncthreads()', 'for(int __i=' + threadPos + '; __i<' + N +
# divide each exp() result by the sum to complete the job. '; __i+=' + threadCount + '){',
'for(int __i=' + threadPos + '; __i<' + N + buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
'; __i+=' + threadCount + '){', '__syncthreads()',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', ]
'}',
'__syncthreads()',
]
@code_version((1,)) @code_version((1,))
...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals()) init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" % loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos)) r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos)) r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
# This is work in progress # This is work in progress
from theano import Op, Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
......
...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp): ...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel(
params=['int M', 'int N', "kSoftmax_%s" % nodename,
'const float * x', 'const int sx0', 'const int sx1', params=['int M', 'int N',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'const float * x',
body=[ 'const int sx0',
"extern __shared__ float buf[]", 'const int sx1',
"float * buf2 = buf + N", 'float * sm',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'const int sm_s0',
" blockIDX += gridDim.x){", 'const int sm_s1'],
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", body=["extern __shared__ float buf[]",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "float * buf2 = buf + N",
"buf2[tx] = buf[tx]", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"}", " blockIDX += gridDim.x){",
"__syncthreads()", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
inline_softmax('N', 'buf', 'buf2', "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
'threadIdx.x', 'blockDim.x'), "buf2[tx] = buf[tx]", "}", "__syncthreads()",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", inline_softmax('N',
# This set all value correctly 'buf',
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", 'buf2',
"}", 'threadIdx.x',
"__syncthreads()", 'blockDim.x'),
"}", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]) # This set all value correctly
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
params=['int M', 'int N', "__syncthreads()", "}", ])
'const float * x', 'const int sx0', 'const int sx1', ret2 = nvcc_kernel(
'float * sm', 'const int sm_s0', 'const int sm_s1'], "kSoftmax_fixed_shared%s" % nodename,
body=[ params=['int M', 'int N',
"extern __shared__ float buf[]", 'const float * x', 'const int sx0', 'const int sx1',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'float * sm', 'const int sm_s0', 'const int sm_s1'],
" blockIDX += gridDim.x){", body=["extern __shared__ float buf[]",
"const float *x_ptr = &x[blockIDX * sx0]", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"float *sm_ptr = &sm[blockIDX * sm_s0]", " blockIDX += gridDim.x){",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', "const float *x_ptr = &x[blockIDX * sx0]",
'sm_ptr', 'sm_s1', "float *sm_ptr = &sm[blockIDX * sm_s0]",
'threadIdx.x', 'blockDim.x'), inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
"__syncthreads()", 'sm_ptr', 'sm_s1',
"}", 'threadIdx.x',
]) 'blockDim.x'),
"__syncthreads()", "}", ])
return ret1 + "\n" + ret2 return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0', 'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=["extern __shared__ float buf[]",
"extern __shared__ float buf[]", "float * buf2 = buf + N",
"float * buf2 = buf + N", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){",
" blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] += b[tx * sb0]",
"buf[tx] += b[tx * sb0]", "buf2[tx] = buf[tx]", "}",
"buf2[tx] = buf[tx]", "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
"}", 'threadIdx.x',
"__syncthreads()", 'blockDim.x'),
inline_softmax('N', 'buf', 'buf2', "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
'threadIdx.x', 'blockDim.x'), "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "__syncthreads()", "}", ])
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const float * x',
...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]", "float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr',
'sm_s1',
'threadIdx.x', 'threadIdx.x',
'blockDim.x', 'blockDim.x',
'b', 'sb0'), 'b', 'sb0'),
......
...@@ -4,7 +4,6 @@ import logging ...@@ -4,7 +4,6 @@ import logging
import os import os
import subprocess import subprocess
import sys import sys
import warnings
from locale import getpreferredencoding from locale import getpreferredencoding
import numpy import numpy
...@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler): ...@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
_logger.debug('Writing module C++ code to %s', cppfilename) _logger.debug('Writing module C++ code to %s', cppfilename)
cppfile.write(src_code) cppfile.write(src_code)
lib_filename = os.path.join(location, '%s.%s' % lib_filename = os.path.join(
(module_name, get_lib_extension())) location, '%s.%s' %
(module_name, get_lib_extension()))
_logger.debug('Generating shared lib %s', lib_filename) _logger.debug('Generating shared lib %s', lib_filename)
# TODO: Why do these args cause failure on gtx285 that has 1.3 # TODO: Why do these args cause failure on gtx285 that has 1.3
...@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler): ...@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
continue continue
for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I', for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
'-L', '--fmad', '--ftz', '--maxrregcount', '-L', '--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math', '--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount', '-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math', '-prec-div', '-prec-sqrt', '-use_fast_math',
'--use-local-env', '--cl-version=']: '--use-local-env', '--cl-version=']:
...@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler): ...@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
# https://wiki.debian.org/RpathIssue for details. # https://wiki.debian.org/RpathIssue for details.
if (not type(config.cuda).root.is_default and if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))): os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib')) rpaths.append(os.path.join(config.cuda.root, 'lib'))
if sys.platform != 'darwin': if sys.platform != 'darwin':
...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler): ...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof = cmd.index('-u') indexof = cmd.index('-u')
cmd.pop(indexof) # Remove -u cmd.pop(indexof) # Remove -u
cmd.pop(indexof) # Remove argument to -u cmd.pop(indexof) # Remove argument to -u
except ValueError as e: except ValueError:
done = True done = True
# CUDA Toolkit v4.1 Known Issues: # CUDA Toolkit v4.1 Known Issues:
...@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler): ...@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
try: try:
os.chdir(location) os.chdir(location)
p = subprocess.Popen( p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2] nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
console_encoding = getpreferredencoding() console_encoding = getpreferredencoding()
nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding) nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding) nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
finally: finally:
os.chdir(orig_dir) os.chdir(orig_dir)
......
...@@ -10,22 +10,32 @@ import warnings ...@@ -10,22 +10,32 @@ import warnings
import numpy import numpy
from six.moves import reduce, xrange from six.moves import reduce, xrange
from . import dnn
import theano import theano
from theano import scalar as scal from theano import scalar as scal
from theano import config, tensor, gof from theano import config, tensor, gof
import theano.ifelse import theano.ifelse
import theano.tensor.signal.pool
import theano.tensor.nnet
import theano.tensor.nnet.neighbours
# Convolution
from theano.tensor.nnet import conv
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
# Pooling
import theano.tensor.signal.pool as pool
from theano.compile import optdb from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB, from theano.gof import (local_optimizer, EquilibriumDB, ProxyDB,
Optimizer, TopoOptimizer, toolbox) Optimizer, TopoOptimizer, toolbox)
from theano.gof.opt import LocalMetaOptimizer from theano.gof.opt import LocalMetaOptimizer
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
from theano.sandbox.cuda import as_cuda_ndarray_variable from theano.sandbox.cuda import as_cuda_ndarray_variable
from theano.sandbox.cuda.basic_ops import ( from theano.sandbox.cuda.basic_ops import (
gpu_eye, gpu_contiguous, gpu_eye, gpu_contiguous,
gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu, gpu_from_host, host_from_gpu, GpuFromHost, HostFromGpu,
GpuContiguous, GpuContiguous,
GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce, GpuElemwise, GpuDimShuffle, GpuReshape, GpuCAReduce,
GpuFlatten, gpu_flatten, gpu_flatten,
GpuSubtensor, GpuAdvancedSubtensor1, GpuSubtensor, GpuAdvancedSubtensor1,
GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20, GpuAdvancedIncSubtensor1, GpuAdvancedIncSubtensor1_dev20,
GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty) GpuIncSubtensor, gpu_alloc, GpuAlloc, gpu_shape, GpuSplit, GpuAllocEmpty)
...@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')( ...@@ -137,8 +147,6 @@ register_opt(name='local_gpu_reshape_chain')(
# This is a partial list of CPU ops that can be in some circonstance # This is a partial list of CPU ops that can be in some circonstance
# moved to the GPU. This list is used by an optimization. # moved to the GPU. This list is used by an optimization.
# Hopefully, we can keep this list up to date. # Hopefully, we can keep this list up to date.
import theano.tensor.signal.pool
import theano.tensor.nnet.neighbours
cpu_ops_moved_to_gpu = [ cpu_ops_moved_to_gpu = [
tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm, tensor.blas.Dot22, tensor.blas.Dot22Scalar, tensor.blas.Gemm,
tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp, tensor.blas.Gemv, tensor.blas.Ger, tensor.nnet.conv.ConvOp,
...@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node): ...@@ -630,7 +638,7 @@ def local_gpu_batched_dot(node):
if y.ndim == 2: if y.ndim == 2:
y_ = y_.dimshuffle(0, 1, "x") y_ = y_.dimshuffle(0, 1, "x")
z = GpuBatchedDot()(as_cuda_ndarray_variable(x_), z = GpuBatchedDot()(as_cuda_ndarray_variable(x_),
as_cuda_ndarray_variable(y_)) as_cuda_ndarray_variable(y_))
# unpad z shape # unpad z shape
if x.ndim == 2: if x.ndim == 2:
z = z.dimshuffle(0, *range(2, z.ndim)) z = z.dimshuffle(0, *range(2, z.ndim))
...@@ -850,8 +858,8 @@ def local_gpu_careduce(node): ...@@ -850,8 +858,8 @@ def local_gpu_careduce(node):
if x.type == node.outputs[0].type: if x.type == node.outputs[0].type:
return [x] return [x]
elif (all([c != "output" and isinstance(c.op, GpuFromHost) elif (all([c != "output" and isinstance(c.op, GpuFromHost)
for c, i in node.outputs[0].clients]) for c, i in node.outputs[0].clients]) and
and x.owner and x.owner.op.__class__ in x.owner and x.owner.op.__class__ in
cpu_ops_moved_to_gpu): cpu_ops_moved_to_gpu):
# It is not always good to transfer the reduction to # It is not always good to transfer the reduction to
# the GPU when the clients are on the GPU but not the # the GPU when the clients are on the GPU but not the
...@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node): ...@@ -970,7 +978,7 @@ def local_gpu_elemwise_careduce(node):
# automatically add more case, as some like trigonometic # automatically add more case, as some like trigonometic
# operation with some reduction pattern will probably result # operation with some reduction pattern will probably result
# to slow down. # to slow down.
isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scal.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
...@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node): ...@@ -1023,7 +1031,8 @@ def local_gpu_flatten(node):
return [gpu_flatten(host_input.owner.inputs[0], outdim)( return [gpu_flatten(host_input.owner.inputs[0], outdim)(
as_cuda_ndarray_variable(host_input.owner.inputs[0]))] as_cuda_ndarray_variable(host_input.owner.inputs[0]))]
if isinstance(node.op, tensor.Flatten): if isinstance(node.op, tensor.Flatten):
x, = node.inputs x, shp = node.inputs
outdim = node.op.outdim
if x.owner and isinstance(x.owner.op, HostFromGpu): if x.owner and isinstance(x.owner.op, HostFromGpu):
outdim = node.op.outdim outdim = node.op.outdim
gpu_x, = x.owner.inputs gpu_x, = x.owner.inputs
...@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node): ...@@ -1050,15 +1059,13 @@ def local_gpu_subtensor(node):
*coords)] *coords)]
if isinstance(node.op, tensor.Subtensor): if isinstance(node.op, tensor.Subtensor):
x = node.inputs[0] x = node.inputs[0]
if (x.owner and if (x.owner and x.dtype == "float32" and
isinstance(x.owner.op, HostFromGpu) and isinstance(x.owner.op, HostFromGpu)):
x.dtype == "float32"):
gpu_x = x.owner.inputs[0] gpu_x = x.owner.inputs[0]
if (gpu_x.owner and if (gpu_x.owner and # And it is a shared var or an input of the graph.
isinstance(gpu_x.owner.op, GpuFromHost) and not(gpu_x.owner.inputs[0].owner) and
# And it is a shared var or an input of the graph. isinstance(gpu_x.owner.op, GpuFromHost)):
not gpu_x.owner.inputs[0].owner):
if len(x.clients) == 1: if len(x.clients) == 1:
if any([n == 'output' or isinstance(n.op, GpuOp) if any([n == 'output' or isinstance(n.op, GpuOp)
...@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1119,9 +1126,7 @@ def local_gpu_advanced_incsubtensor1(node):
'least \'0.6\'.', stacklevel=1) 'least \'0.6\'.', stacklevel=1)
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
...@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node): ...@@ -1162,9 +1167,7 @@ def local_gpu_advanced_incsubtensor1(node):
active_device_no = theano.sandbox.cuda.active_device_number() active_device_no = theano.sandbox.cuda.active_device_number()
compute_capability = device_properties(active_device_no)['major'] compute_capability = device_properties(active_device_no)['major']
if (compute_capability < 2 or if (compute_capability < 2 or y.ndim != 2 or x.ndim != 2):
x.ndim != 2 or
y.ndim != 2):
gpu_op = GpuAdvancedIncSubtensor1( gpu_op = GpuAdvancedIncSubtensor1(
set_instead_of_inc=set_instead_of_inc) set_instead_of_inc=set_instead_of_inc)
else: else:
...@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node): ...@@ -1203,8 +1206,8 @@ def local_gpu_incsubtensor(node):
# Incrementing a float32 x results in a float32 # Incrementing a float32 x results in a float32
# output even if y is float64, so we can downcast # output even if y is float64, so we can downcast
# y to put it on GPU # y to put it on GPU
elif type(node.op) == tensor.IncSubtensor and \ elif (type(node.op) == tensor.IncSubtensor and
node.inputs[0].dtype == "float32": node.inputs[0].dtype == "float32"):
x, y = node.inputs[0:2] x, y = node.inputs[0:2]
assert isinstance(x.type, tensor.TensorType) assert isinstance(x.type, tensor.TensorType)
assert isinstance(y.type, tensor.TensorType) assert isinstance(y.type, tensor.TensorType)
...@@ -1346,8 +1349,6 @@ def cast(x, dtype): ...@@ -1346,8 +1349,6 @@ def cast(x, dtype):
cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype))) cast_op = theano.tensor.Elemwise(scal.Identity(scal.specific_out(stype)))
return cast_op(x) return cast_op(x)
import theano.tensor.nnet
@register_opt() @register_opt()
@local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias]) @local_optimizer([tensor.nnet.CrossentropySoftmaxArgmax1HotWithBias])
...@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node): ...@@ -1419,18 +1420,13 @@ def local_gpu_softmax_with_bias(node):
return False return False
# Convolution
from theano.tensor.nnet import conv
def _gpu_conv_to_fftconv(node): def _gpu_conv_to_fftconv(node):
# shared helper function for local_conv_fft_valid and local_conv_fft_full. # shared helper function for local_conv_fft_valid and local_conv_fft_full.
# we import conv2d_fft locally to avoid pycuda warnings # we import conv2d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv2d_fft from theano.sandbox.cuda.fftconv import conv2d_fft
kwargs = {'border_mode': node.op.border_mode} kwargs = {'border_mode': node.op.border_mode}
if (node.op.imshp is not None and if (node.op.imshp is not None and node.op.imshp[-1] % 2 == 1 and
node.op.imshp[-1] is not None and node.op.imshp[-1] is not None):
node.op.imshp[-1] % 2 == 1):
kwargs['pad_last_dim'] = True kwargs['pad_last_dim'] = True
# If the user supplied the full nonsymbolic image_shape and # If the user supplied the full nonsymbolic image_shape and
...@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node): ...@@ -1459,9 +1455,8 @@ def _gpu_conv_to_fftconv(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_fft_valid(node): def local_conv_fft_valid(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.border_mode == 'valid' and if (node.op.border_mode == 'valid' and node.op.fft_opt and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1)):
node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return False return False
...@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node): ...@@ -1470,9 +1465,8 @@ def local_conv_fft_valid(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_fft_full(node): def local_conv_fft_full(node):
if isinstance(node.op, GpuConv): if isinstance(node.op, GpuConv):
if (node.op.border_mode == 'full' and if (node.op.border_mode == 'full' and node.op.fft_opt and
node.op.subsample == (1, 1) and node.op.subsample == (1, 1)):
node.op.fft_opt):
return [_gpu_conv_to_fftconv(node)] return [_gpu_conv_to_fftconv(node)]
return return
...@@ -1586,7 +1580,7 @@ def local_gpu_conv(node): ...@@ -1586,7 +1580,7 @@ def local_gpu_conv(node):
@local_optimizer([GpuConv]) @local_optimizer([GpuConv])
def local_conv_gemm(node): def local_conv_gemm(node):
if (isinstance(node.op, GpuConv) and if (isinstance(node.op, GpuConv) and
node.op.border_mode in ['full', 'valid']): node.op.border_mode in ['full', 'valid']):
img, kern = node.inputs img, kern = node.inputs
border_mode = node.op.border_mode border_mode = node.op.border_mode
...@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10, ...@@ -1659,7 +1653,6 @@ conv_groupopt.register('conv_fft_full', local_conv_fft_full, 10,
'conv_fft') 'conv_fft')
# cuDNN is the second, but only registered if cuDNN is available. # cuDNN is the second, but only registered if cuDNN is available.
# It can be disabled by excluding 'conv_dnn' or 'cudnn'. # It can be disabled by excluding 'conv_dnn' or 'cudnn'.
from . import dnn
# We can't check at import if dnn is available, so we must always # We can't check at import if dnn is available, so we must always
# register it. This do not cause problem as if it is not avail, the # register it. This do not cause problem as if it is not avail, the
# opt will do nothing. # opt will do nothing.
...@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer): ...@@ -1708,9 +1701,8 @@ class ConvMetaOptimizer(LocalCudaMetaOptimizer):
shapes = ((node.op.bsize,) + node.op.imshp, shapes = ((node.op.bsize,) + node.op.imshp,
(node.op.nkern, nchannels) + node.op.kshp) (node.op.nkern, nchannels) + node.op.kshp)
for (var, shape) in zip(vars, shapes): for (var, shape) in zip(vars, shapes):
if ((var in inputs) and if ((var in inputs) and (shape is not None) and
(shape is not None) and not any(s is None for s in shape)):
not any(s is None for s in shape)):
result[var] = theano.shared( result[var] = theano.shared(
# TODO: Use var.type.filter when cuda_ndarray.filter # TODO: Use var.type.filter when cuda_ndarray.filter
...@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node): ...@@ -1763,8 +1755,6 @@ def local_conv3d_fft(node):
gpu_optimizer.register("conv3d_fft", local_conv3d_fft) gpu_optimizer.register("conv3d_fft", local_conv3d_fft)
from theano.tensor.nnet.ConvGrad3D import ConvGrad3D
@local_optimizer([ConvGrad3D]) @local_optimizer([ConvGrad3D])
def local_convgrad3d_fft(node): def local_convgrad3d_fft(node):
...@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node): ...@@ -1775,7 +1765,7 @@ def local_convgrad3d_fft(node):
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
return False return False
if (isinstance(node.op, ConvGrad3D) and if (isinstance(node.op, ConvGrad3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
...@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node): ...@@ -1794,8 +1784,6 @@ def local_convgrad3d_fft(node):
gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft) gpu_optimizer.register("convgrad3d_fft", local_convgrad3d_fft)
from theano.tensor.nnet.ConvTransp3D import ConvTransp3D
@local_optimizer([ConvTransp3D]) @local_optimizer([ConvTransp3D])
def local_convtransp3d_fft(node): def local_convtransp3d_fft(node):
...@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node): ...@@ -1806,7 +1794,7 @@ def local_convtransp3d_fft(node):
except tensor.NotScalarConstantError: except tensor.NotScalarConstantError:
return False return False
if (isinstance(node.op, ConvTransp3D) and if (isinstance(node.op, ConvTransp3D) and
(stride_x, stride_y, stride_z) == (1, 1, 1)): (stride_x, stride_y, stride_z) == (1, 1, 1)):
# we import conv3d_fft locally to avoid pycuda warnings # we import conv3d_fft locally to avoid pycuda warnings
from theano.sandbox.cuda.fftconv import conv3d_fft from theano.sandbox.cuda.fftconv import conv3d_fft
# Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t) # Shuffle filters from (oc, 0, 1, t, ic) to (ic, oc, 0, 1, t)
...@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node): ...@@ -1894,15 +1882,11 @@ def local_convtransp3d_gemm(node):
gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm) gpu_optimizer.register("convtransp3d_gemm", local_convtransp3d_gemm)
# Pooling
import theano.tensor.signal.pool as pool
@register_opt() @register_opt()
@local_optimizer([pool.Pool]) @local_optimizer([pool.Pool])
def local_gpu_downsample_factor_max(node): def local_gpu_downsample_factor_max(node):
if (isinstance(node.op, pool.Pool) if (isinstance(node.op, pool.Pool) and
and node.op.ds == node.op.st): node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
...@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node): ...@@ -1917,14 +1901,12 @@ def local_gpu_downsample_factor_max(node):
@register_opt() @register_opt()
@local_optimizer([pool.MaxPoolGrad]) @local_optimizer([pool.MaxPoolGrad])
def local_gpu_downsample_factor_max_grad(node): def local_gpu_downsample_factor_max_grad(node):
if (isinstance(node.op, pool.MaxPoolGrad) and if (isinstance(node.op, pool.MaxPoolGrad) and node.op.ds == node.op.st):
node.op.ds == node.op.st):
assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding', assert node.op.__props__ == ('ds', 'ignore_border', 'st', 'padding',
'mode') 'mode')
if (node.op.padding != (0, 0) or if (node.op.padding != (0, 0) or
node.op.mode != 'max' or node.op.mode != 'max' or
node.op.st != node.op.ds): node.op.st != node.op.ds):
return return
x, z, gz = node.inputs x, z, gz = node.inputs
...@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node): ...@@ -1955,9 +1937,6 @@ def local_gpu_downsample_factor_max_grad_grad(node):
as_cuda_ndarray_variable(gx)))] as_cuda_ndarray_variable(gx)))]
from theano.sandbox.cuda.basic_ops import gpu_join, GpuJoin
@register_opt() @register_opt()
@local_optimizer([tensor.Join]) @local_optimizer([tensor.Join])
def local_gpu_join(node): def local_gpu_join(node):
...@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node): ...@@ -2252,8 +2231,8 @@ def local_gpualloc_memset_0(node):
if isinstance(node.op, GpuAlloc) and not node.op.memset_0: if isinstance(node.op, GpuAlloc) and not node.op.memset_0:
inp = node.inputs[0] inp = node.inputs[0]
if (isinstance(inp, CudaNdarrayConstant) and if (isinstance(inp, CudaNdarrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(numpy.asarray(inp.data) == 0).all()): (numpy.asarray(inp.data) == 0).all()):
new_out = GpuAlloc(memset_0=True)(*node.inputs) new_out = GpuAlloc(memset_0=True)(*node.inputs)
old_bcast = node.outputs[0].type.broadcastable old_bcast = node.outputs[0].type.broadcastable
...@@ -2308,8 +2287,9 @@ def local_gpu_eye(node): ...@@ -2308,8 +2287,9 @@ def local_gpu_eye(node):
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, tensor.Eye) and isinstance(host_input.owner.op, tensor.Eye) and
host_input.owner.op.dtype == "float32"): host_input.owner.op.dtype == "float32"):
if tensor.extract_constant(host_input.owner.inputs[2]) != 0: if tensor.extract_constant(host_input.owner.inputs[2]) != 0:
return return
return [gpu_eye(*host_input.owner.inputs)] return [gpu_eye(*host_input.owner.inputs)]
...@@ -2324,7 +2304,7 @@ def local_gpu_eye(node): ...@@ -2324,7 +2304,7 @@ def local_gpu_eye(node):
def safe_to_gpu(x): def safe_to_gpu(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
return as_cuda_ndarray_variable(x) return as_cuda_ndarray_variable(x)
else: else:
...@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None): ...@@ -2379,7 +2359,7 @@ def gpu_reconstruct_graph(inputs, outputs, tag=None):
def tensor_to_cuda(x): def tensor_to_cuda(x):
if (isinstance(x.type, tensor.TensorType) and if (isinstance(x.type, tensor.TensorType) and
x.type.dtype == 'float32'): x.type.dtype == 'float32'):
y = CudaNdarrayType(broadcastable=x.type.broadcastable)() y = CudaNdarrayType(broadcastable=x.type.broadcastable)()
if x.name: if x.name:
...@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node): ...@@ -2437,9 +2417,9 @@ def gpuScanOptimization(node):
if isinstance(node.op, GpuFromHost): if isinstance(node.op, GpuFromHost):
host_input = node.inputs[0] host_input = node.inputs[0]
if (host_input.owner and if (host_input.owner and
isinstance(host_input.owner.op, scan_op.Scan) and isinstance(host_input.owner.op, scan_op.Scan) and
not host_input.owner.op.info['gpu'] and not host_input.owner.op.info['gpu'] and
len(host_input.owner.outputs) == 1): len(host_input.owner.outputs) == 1):
# Note that we are not doing the right thing here !! # Note that we are not doing the right thing here !!
# This is because the local optimizer expects only one # This is because the local optimizer expects only one
...@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node): ...@@ -2492,8 +2472,8 @@ def gpuScanOptimization(node):
return _outputs return _outputs
# scan(host_from_gpu) -> host_from_gpu(GPUscan) # scan(host_from_gpu) -> host_from_gpu(GPUscan)
if (type(node.op) == scan_op.Scan if (type(node.op) == scan_op.Scan and
and not node.op.info['gpu']): not node.op.info['gpu']):
if any([(i.owner and isinstance(i.owner.op, HostFromGpu)) if any([(i.owner and isinstance(i.owner.op, HostFromGpu))
for i in node.inputs]): for i in node.inputs]):
...@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node): ...@@ -2792,7 +2772,7 @@ def local_abstractconv_gemm(node):
kern = kern.dimshuffle(1, 0, 2, 3) kern = kern.dimshuffle(1, 0, 2, 3)
# call GpuCorrMM_gradInputs # call GpuCorrMM_gradInputs
rval = GpuCorrMM_gradInputs('valid', subsample)( rval = GpuCorrMM_gradInputs('valid', subsample)(
gpu_contiguous(kern), gpu_contiguous(img)) gpu_contiguous(kern), gpu_contiguous(img))
else: else:
# need to flip the kernel if necessary # need to flip the kernel if necessary
if node.op.filter_flip: if node.op.filter_flip:
...@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node): ...@@ -2807,11 +2787,11 @@ def local_abstractconv_gemm(node):
# GpuConv does not always store information on the batchsize and # GpuConv does not always store information on the batchsize and
# channels, though, so we only use what information we have.) # channels, though, so we only use what information we have.)
if ((subsample == (1, 1)) and if ((subsample == (1, 1)) and
(node.op.imshp is not None) and (node.op.imshp is not None) and
(None not in node.op.imshp[-2:]) and (None not in node.op.imshp[-2:]) and
(node.op.kshp is not None) and (node.op.kshp is not None) and
(None not in node.op.kshp) and (None not in node.op.kshp) and
border_mode != "half"): border_mode != "half"):
# we know the kernel and output size # we know the kernel and output size
prod1 = node.op.kshp[0] * node.op.kshp[1] prod1 = node.op.kshp[0] * node.op.kshp[1]
prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) * prod2 = ((node.op.imshp[-2] - node.op.kshp[0] + 1) *
......
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
import numpy import numpy
import theano.gof import theano.gof
from theano.compat import PY3 from theano.compat import PY3
...@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt) ...@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, Variable from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config = theano.config config = theano.config
class CURAND_Base(GpuOp): class CURAND_Base(GpuOp):
""" """
Base class for a random number generator implemented in CURAND. Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by The random number generator itself is an opaque reference managed by
...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp): ...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op. Return a tuple of attributes that define the Op.
""" """
return ( return (self.destructive,
self.destructive,
self.output_type, self.output_type,
self.seed, self.seed,
) )
...@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp): ...@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
def make_node(self, generator, size): def make_node(self, generator, size):
return theano.gof.Apply(self, [generator, size], return theano.gof.Apply(self, [generator, size],
[generator.type(), self.output_type()]) [generator.type(), self.output_type()])
@classmethod @classmethod
def new_auto_update(cls, generator, ndim, dtype, size, seed): def new_auto_update(cls, generator, ndim, dtype, size, seed):
...@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp): ...@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
v_size = theano.tensor.as_tensor_variable(size) v_size = theano.tensor.as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
self = cls( self = cls(output_type=CudaNdarrayType((False,) * ndim),
output_type=CudaNdarrayType((False,) * ndim), seed=seed,
seed=seed, destructive=False)
destructive=False)
o_gen, sample = self(generator, cast(v_size, 'int32')) o_gen, sample = self(generator, cast(v_size, 'int32'))
...@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object): ...@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables. RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable. One caveat is that generators are not serializable.
Parameters Parameters
---------- ----------
seed : int seed : int
...@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object): ...@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
return rval return rval
def uniform(self, size, low=0.0, high=1.0, ndim=None, def uniform(self, size, low=0.0, high=1.0, ndim=None,
dtype=config.floatX): dtype=config.floatX):
""" """
Return symbolic tensor of uniform numbers. Return symbolic tensor of uniform numbers.
...@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object): ...@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable) assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg for i in size]), msg
else: else:
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size) s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size, u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed()) self.next_seed())
self.state_updates.append(u.update) self.state_updates.append(u.update)
rval = u * (high - low) + low rval = u * (high - low) + low
if u.type.broadcastable != rval.type.broadcastable: if u.type.broadcastable != rval.type.broadcastable:
...@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object): ...@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of ' 'Increase the size to match the broadcasting pattern of '
'low and `high` arguments' 'low and `high` arguments'
) )
return rval return rval
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
dtype=config.floatX): dtype=config.floatX):
""" """
Return symbolic tensor of normally-distributed numbers. Return symbolic tensor of normally-distributed numbers.
...@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object): ...@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable) assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg for i in size]), msg
else: else:
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size) s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed()) self.next_seed())
self.state_updates.append(u.update) self.state_updates.append(u.update)
rval = u * std + avg rval = u * std + avg
if u.type.broadcastable != rval.type.broadcastable: if u.type.broadcastable != rval.type.broadcastable:
...@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object): ...@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of `low`' 'Increase the size to match the broadcasting pattern of `low`'
'and `high` arguments' 'and `high` arguments'
) )
return rval return rval
@local_optimizer([CURAND_Base]) @local_optimizer([CURAND_Base])
...@@ -386,5 +383,5 @@ def local_destructive(node): ...@@ -386,5 +383,5 @@ def local_destructive(node):
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('CURAND_destructive', optdb.register('CURAND_destructive',
opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run', opt.in2out(local_destructive, ignore_newtrees=True),
'inplace') 99, 'fast_run', 'inplace')
...@@ -9,19 +9,20 @@ import numpy ...@@ -9,19 +9,20 @@ import numpy
from six.moves import xrange from six.moves import xrange
import theano import theano
import theano.tensor as T import theano.tensor as T
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
from nose.tools import assert_raises
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
import theano.sandbox.cuda.basic_ops as B import theano.sandbox.cuda.basic_ops as B
from theano.tensor.basic import _allclose from theano.tensor.basic import _allclose
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.tensor.tests.test_basic
import theano.tensor.tests.test_subtensor
import theano.tensor.tests.test_sharedvar
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -75,8 +76,8 @@ def test_careduce(): ...@@ -75,8 +76,8 @@ def test_careduce():
# The following 2 cases could work if the scalar_op.c_code work with float* dtype. # The following 2 cases could work if the scalar_op.c_code work with float* dtype.
# Currently we have this error: # Currently we have this error:
# error: invalid operands of types 'npy_float32' and 'npy_float32' to binary 'operator&' # error: invalid operands of types 'npy_float32' and 'npy_float32' to binary 'operator&'
#(theano.scalar.and_, tensor.elemwise.CAReduce), # (theano.scalar.and_, tensor.elemwise.CAReduce),
#(theano.scalar.or_, tensor.elemwise.CAReduce), # (theano.scalar.or_, tensor.elemwise.CAReduce),
]: ]:
for shape, pattern in [((1, 1), (1,)), for shape, pattern in [((1, 1), (1,)),
((1, 0), (1,)), ((1, 0), (1,)),
...@@ -113,7 +114,7 @@ def test_careduce(): ...@@ -113,7 +114,7 @@ def test_careduce():
((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001 ((4100, 4, 3), [2]), ((5, 4100, 3), [2]), ((5, 4, 4100), [2]), # 001
((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110 ((4100, 4, 3), [0, 1]), ((5, 4100, 3), [0, 1]), ((5, 4, 4100), [0, 1]), # 110
((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011 ((4100, 4, 3), [1, 2]), ((5, 4100, 3), [1, 2]), ((5, 4, 4100), [1, 2]), # 011
((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]), ((4100, 4, 3), [0, 2]), ((5, 4100, 3), [0, 2]), ((5, 4, 4100), [0, 2]),
((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111 ((4100, 4, 3), [0, 1, 2]), ((5, 4100, 3), [0, 1, 2]), ((5, 4, 4100), [0, 1, 2]), # 111
((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111 ((65, 4, 3), [0, 1, 2]), ((5, 65, 3), [0, 1, 2]), ((5, 4, 65), [0, 1, 2]), # 111
...@@ -127,15 +128,15 @@ def test_careduce(): ...@@ -127,15 +128,15 @@ def test_careduce():
((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011 ((4100, 4, 3, 2), [2, 3]), ((4, 4100, 3, 2), [2, 3]), ((4, 3, 4100, 2), [2, 3]), ((4, 3, 2, 4100), [2, 3]), # 0011
((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101 ((4100, 4, 3, 2), [1, 3]), ((4, 4100, 3, 2), [1, 3]), ((4, 3, 4100, 2), [1, 3]), ((4, 3, 2, 4100), [1, 3]), # 0101
((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110 ((4100, 4, 3, 2), [1, 2]), ((4, 4100, 3, 2), [1, 2]), ((4, 3, 4100, 2), [1, 2]), ((4, 3, 2, 4100), [1, 2]), # 0110
((4100,4,3,2),[0,3]),((4,4100,3,2),[0,3]),((4,3,4100,2),[0,3]),((4,3,2,4100),[0,3]),#1001 ((4100, 4, 3, 2), [0, 3]), ((4, 4100, 3, 2), [0, 3]), ((4, 3, 4100, 2), [0, 3]), ((4, 3, 2, 4100), [0, 3]), # 1001
# ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented # ((4100,4,3,2),[0,2]),((4,4100,3,2),[0,2]),((4,3,4100,2),[0,2]),((4,3,2,4100),[0,2]),#1010 not implemented
((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100 ((4100, 4, 3, 2), [0, 1]), ((4, 4100, 3, 2), [0, 1]), ((4, 3, 4100, 2), [0, 1]), ((4, 3, 2, 4100), [0, 1]), # 1100
# reduce over 3d # reduce over 3d
# 3d not tested: 1101, 1110, 1111 # 3d not tested: 1101, 1110, 1111
((4100,4,3,2),[0,1,3]),((4,4100,3,2),[0,1,3]),((4,3,4100,2),[0,1,3]),((4,3,2,4100),[0,1,3]),#1101 ((4100, 4, 3, 2), [0, 1, 3]), ((4, 4100, 3, 2), [0, 1, 3]), ((4, 3, 4100, 2), [0, 1, 3]), ((4, 3, 2, 4100), [0, 1, 3]), # 1101
((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110 ((4100, 4, 3, 2), [0, 1, 2]), ((4, 4100, 3, 2), [0, 1, 2]), ((4, 3, 4100, 2), [0, 1, 2]), ((4, 3, 2, 4100), [0, 1, 2]), # 1110
((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4,3,2,4100),[0,2,3]),#1011 ((4100, 4, 3, 2), [0, 2, 3]), ((4, 4100, 3, 2), [0, 2, 3]), ((4, 3, 4100, 2), [0, 2, 3]), # ((4, 3, 2, 4100), [0, 2, 3]), # 1011
((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111 ((4100, 4, 3, 2), [1, 2, 3]), ((4, 4100, 3, 2), [1, 2, 3]), ((4, 3, 4100, 2), [1, 2, 3]), ((4, 3, 2, 4100), [1, 2, 3]), # 0111
((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111 ((65, 4, 3, 2), [1, 2, 3]), ((4, 65, 3, 2), [1, 2, 3]), ((4, 3, 65, 2), [1, 2, 3]), ((4, 3, 2, 65), [1, 2, 3]), # 0111
...@@ -148,26 +149,26 @@ def test_careduce(): ...@@ -148,26 +149,26 @@ def test_careduce():
]: ]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) tensor_pattern_to_gpu_pattern(shape, pattern)
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
b = op(a*a) b = op(a * a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32') val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu) f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu) f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [
for x in f.maker.fgraph.toposort()], ( x.op.__class__ for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
if tcn.GpuElemwise in [x.op.__class__ if(tcn.GpuElemwise in [
for x in f.maker.fgraph.toposort()]: x.op.__class__ for x in f.maker.fgraph.toposort()]):
assert tcn.GpuReshape in [x.op.__class__ assert tcn.GpuReshape in [
for x in f.maker.fgraph.toposort()] x.op.__class__ for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [
for x in f2.maker.fgraph.toposort()], ( x.op.__class__ for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
f_caused_value_error = False f_caused_value_error = False
try: try:
f_out = f(val) f_out = f(val)
...@@ -176,8 +177,9 @@ def test_careduce(): ...@@ -176,8 +177,9 @@ def test_careduce():
f_caused_value_error = True f_caused_value_error = True
except NotImplementedError: except NotImplementedError:
if (numpy.prod(shape) == 0 and if (numpy.prod(shape) == 0 and
getattr(scalar_op, 'identity', None) != 0): getattr(
continue scalar_op, 'identity', None) != 0):
continue
raise raise
f2_caused_value_error = False f2_caused_value_error = False
...@@ -208,45 +210,49 @@ def test_careduce(): ...@@ -208,45 +210,49 @@ def test_careduce():
# example in debug mode with unittests.rseed=9275 # example in debug mode with unittests.rseed=9275
orig_rtol = theano.tensor.basic.float32_rtol orig_rtol = theano.tensor.basic.float32_rtol
theano.tensor.basic.float32_rtol = 2e-5 theano.tensor.basic.float32_rtol = 2e-5
assert _allclose(f_out, f2_out), ('shape', shape, assert _allclose(f_out, f2_out), (
'pattern', pattern, 'shape',
scalar_op, shape,
sum([shape[i] for i in pattern]), 'pattern',
f2(val), f(val), val) pattern, scalar_op,
sum([shape[i] for i in pattern]),
f2(val), f(val), val)
finally: finally:
theano.tensor.basic.float32_rtol = orig_rtol theano.tensor.basic.float32_rtol = orig_rtol
# test with dimshuffle # test with dimshuffle
# we shuffle the 2 outer dims. # we shuffle the 2 outer dims.
for shape, pattern in [ # ((5,),[0]), # for shape, pattern in [((5,), [0]),
((5, 4), [0, 1]), ((5, 4), [0]), for shape, pattern in [((5, 4), [0, 1]), ((5, 4), [0]),
((5, 4, 3), [0]), ((5, 4, 3), [0, 1]), ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]), ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]),
((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]), ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
((128, 1, 3, 3), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 1, 2, 3]),
]: ((5, 4, 3, 2), [0, 2, 3]),
((128, 1, 3, 3), [0, 1, 2, 3]), ]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) tensor_pattern_to_gpu_pattern(shape, pattern)
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
dim_pattern = list(range(len(shape))) dim_pattern = list(range(len(shape)))
dim_pattern[0] = 1 dim_pattern[0] = 1
dim_pattern[1] = 0 dim_pattern[1] = 0
a = a.dimshuffle(dim_pattern) a = a.dimshuffle(dim_pattern)
b = op(a*a) b = op(a * a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
val = theano._asarray(val, dtype='float32') val = theano._asarray(val, dtype='float32')
f = theano.function([a], b, mode=mode_with_gpu) f = theano.function([a], b, mode=mode_with_gpu)
f2 = theano.function([a], b, mode=mode_without_gpu) f2 = theano.function([a], b, mode=mode_without_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [
for x in f.maker.fgraph.toposort()], ( x.op.__class__ for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__ assert tcn.GpuElemwise not in [
for x in f.maker.fgraph.toposort()] x.op.__class__ for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [
for x in f2.maker.fgraph.toposort()], ( x.op.__class__ for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert _allclose(f2(val), f(val)), ('shape', shape, assert _allclose(f2(val), f(val)), ('shape', shape,
'pattern', pattern, 'pattern', pattern,
scalar_op, scalar_op,
...@@ -258,16 +264,15 @@ def test_careduce(): ...@@ -258,16 +264,15 @@ def test_careduce():
((5, 4, 3), [0]), ((5, 4, 3), [0, 1]), ((5, 4, 3), [0]), ((5, 4, 3), [0, 1]),
((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]), ((5, 4, 3), [2]), ((5, 4, 3), [0, 1, 2]),
((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]), ((5, 4, 3, 2), [0, 1, 2, 3]), ((5, 4, 3, 2), [0, 2, 3]),
((128, 1, 3, 3), [0, 1, 2, 3]), ((128, 1, 3, 3), [0, 1, 2, 3]), ]:
]:
op = careduce_op(scalar_op, axis=pattern) op = careduce_op(scalar_op, axis=pattern)
pat = tensor_pattern_to_gpu_pattern(shape, pattern) tensor_pattern_to_gpu_pattern(shape, pattern)
shape = numpy.asarray(shape) * 2 shape = numpy.asarray(shape) * 2
a = tensor.TensorType('float32', (False,) * len(shape))() a = tensor.TensorType('float32', (False,) * len(shape))()
a2 = tcn.CudaNdarrayType((False,) * len(shape))() a2 = tcn.CudaNdarrayType((False,) * len(shape))()
b = op(a*a) b = op(a * a)
b2 = op(a2*a2) b2 = op(a2 * a2)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape) val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
# val = numpy.ones(shape) # val = numpy.ones(shape)
# val = numpy.arange(numpy.prod(shape)).reshape(shape) # val = numpy.arange(numpy.prod(shape)).reshape(shape)
...@@ -287,14 +292,14 @@ def test_careduce(): ...@@ -287,14 +292,14 @@ def test_careduce():
val2 = val2[::2, ::2, ::2, ::2] val2 = val2[::2, ::2, ::2, ::2]
f = theano.function([a], b, mode=mode_without_gpu) f = theano.function([a], b, mode=mode_without_gpu)
f2 = theano.function([a2], b2, mode=mode_with_gpu) f2 = theano.function([a2], b2, mode=mode_with_gpu)
assert tcn.GpuCAReduce in [x.op.__class__ assert tcn.GpuCAReduce in [
for x in f2.maker.fgraph.toposort()], ( x.op.__class__ for x in f2.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert tcn.GpuElemwise not in [x.op.__class__ assert tcn.GpuElemwise not in [x.op.__class__
for x in f.maker.fgraph.toposort()] for x in f.maker.fgraph.toposort()]
assert op.__class__ in [x.op.__class__ assert op.__class__ in [x.op.__class__
for x in f.maker.fgraph.toposort()], ( for x in f.maker.fgraph.toposort()], (
scalar_op, shape, pattern) scalar_op, shape, pattern)
assert _allclose(f2(val2), f(val)), ('shape', shape, assert _allclose(f2(val2), f(val)), ('shape', shape,
'pattern', pattern, 'pattern', pattern,
sum([shape[i] for i in pattern])) sum([shape[i] for i in pattern]))
...@@ -374,8 +379,10 @@ def test_reshape(): ...@@ -374,8 +379,10 @@ def test_reshape():
# Test zero dimensions are allowed # Test zero dimensions are allowed
x = T.vector('x') x = T.vector('x')
f_reshp = theano.function([x], x.reshape((0,100)), mode=mode_with_gpu) f_reshp = theano.function(
assert f_reshp(numpy.ndarray((0,), dtype='float32')).shape == (0,100) [x], x.reshape((0, 100)), mode=mode_with_gpu)
assert f_reshp(
numpy.ndarray((0, ), dtype='float32')).shape == (0, 100)
def test_alloc_empty(): def test_alloc_empty():
...@@ -406,7 +413,7 @@ def test_elemwise_empty(): ...@@ -406,7 +413,7 @@ def test_elemwise_empty():
b = tensor.fmatrix() b = tensor.fmatrix()
f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
f2 = pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu) pfunc([b], [], updates=[(a, a + b)], mode=mode_without_gpu)
a0 = a.get_value() * 1.0 a0 = a.get_value() * 1.0
f(numpy.ones((0, 0), dtype='float32')) f(numpy.ones((0, 0), dtype='float32'))
...@@ -424,8 +431,9 @@ def test_elemwise0(): ...@@ -424,8 +431,9 @@ def test_elemwise0():
f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu) f = pfunc([b], [], updates=[(a, a + b)], mode=mode_with_gpu)
# check that we work inplace. # check that we work inplace.
assert (list(f.maker.fgraph.toposort()[1].op.destroy_map.items()) assert (list(
== [(0, [0])]) f.maker.fgraph.toposort()[1].op.destroy_map.items()) == [
(0, [0])])
a0 = a.get_value() * 1.0 a0 = a.get_value() * 1.0
f(numpy.ones((4, 4), dtype='float32')) f(numpy.ones((4, 4), dtype='float32'))
...@@ -495,7 +503,8 @@ def test_elemwise2(): ...@@ -495,7 +503,8 @@ def test_elemwise2():
dtype='float32'), 'a') dtype='float32'), 'a')
b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))() b = tensor.Tensor(dtype='float32', broadcastable=[0] * len(shape))()
f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) * f = pfunc([b], [], updates=[(a, (a + b).dimshuffle([2, 0, 3, 1]) *
tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))], mode=mode_with_gpu) tensor.exp(b ** a).dimshuffle([2, 0, 3, 1]))],
mode=mode_with_gpu)
has_elemwise = False has_elemwise = False
for i, node in enumerate(f.maker.fgraph.toposort()): for i, node in enumerate(f.maker.fgraph.toposort()):
has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise) has_elemwise = has_elemwise or isinstance(node.op, tensor.Elemwise)
...@@ -585,10 +594,11 @@ def test_elemwise_composite_float64(): ...@@ -585,10 +594,11 @@ def test_elemwise_composite_float64():
return l return l
for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'), for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
mode_with_gpu.excluding('elemwise_fusion')]: mode_with_gpu.excluding('elemwise_fusion')]:
f = pfunc([a, b], f = pfunc(
tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2, [a, b],
b), tensor.cast(
'float32'), mode=mode) tensor.lt(tensor.cast(a, 'float64') ** 2, b), 'float32'),
mode=mode)
out = f(av, bv) out = f(av, bv)
assert numpy.all(out == ((av ** 2) < bv)) assert numpy.all(out == ((av ** 2) < bv))
...@@ -648,11 +658,11 @@ def speed_elemwise_collapse(): ...@@ -648,11 +658,11 @@ def speed_elemwise_collapse():
v = theano._asarray(numpy.random.rand(*shape), dtype='float32') v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
v = v[:, ::2, :, :] v = v[:, ::2, :, :]
v = cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
t1 = time.time() time.time()
for i in range(100): for i in range(100):
# let debugmode catch errors # let debugmode catch errors
f(v) f(v)
t2 = time.time() time.time()
def speed_elemwise_collapse2(): def speed_elemwise_collapse2():
...@@ -672,11 +682,11 @@ def speed_elemwise_collapse2(): ...@@ -672,11 +682,11 @@ def speed_elemwise_collapse2():
v = theano._asarray(numpy.random.rand(*shape), dtype='float32') v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
v = v[:, :, :, ::2] v = v[:, :, :, ::2]
v = cuda_ndarray.CudaNdarray(v) v = cuda_ndarray.CudaNdarray(v)
t1 = time.time() time.time()
for i in range(100): for i in range(100):
# let debugmode catch errors # let debugmode catch errors
f(v) f(v)
t2 = time.time() time.time()
def test_elemwise_collapse(): def test_elemwise_collapse():
...@@ -848,8 +858,8 @@ def test_hostfromgpu_shape_i(): ...@@ -848,8 +858,8 @@ def test_hostfromgpu_shape_i():
ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))() ca = theano.sandbox.cuda.var.CudaNdarrayType((False, False))()
av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32') av = numpy.asarray(numpy.random.rand(5, 4), dtype='float32')
cv = cuda.CudaNdarray(numpy.asarray(numpy.random.rand(5, 4), cv = cuda.CudaNdarray(numpy.asarray(
dtype='float32')) numpy.random.rand(5, 4), dtype='float32'))
f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m) f = theano.function([a], cuda.basic_ops.gpu_from_host(a), mode=m)
assert cuda.basic_ops.gpu_from_host in [x.op assert cuda.basic_ops.gpu_from_host in [x.op
...@@ -880,7 +890,7 @@ def test_gpujoin_assert_cndas(): ...@@ -880,7 +890,7 @@ def test_gpujoin_assert_cndas():
a = theano.shared(_a) a = theano.shared(_a)
try: try:
c = cuda.basic_ops.gpu_join(1, a) cuda.basic_ops.gpu_join(1, a)
# can't "assert False" here, as we want the assertion # can't "assert False" here, as we want the assertion
# error from gpu_join # error from gpu_join
except TypeError: except TypeError:
...@@ -921,13 +931,18 @@ def test_gpujoin_gpualloc(): ...@@ -921,13 +931,18 @@ def test_gpujoin_gpualloc():
b = T.fmatrix('b') b = T.fmatrix('b')
b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32') b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')
f = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4, f = theano.function(
mode=mode_without_gpu) [a, b],
f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)), T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
mode=mode_with_gpu) mode=mode_without_gpu)
f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a), f_gpu = theano.function(
T.ones_like(b)) + 4, [a, b],
mode=mode_with_gpu) T.join(0, T.zeros_like(a), T.ones_like(b)),
mode=mode_with_gpu)
f_gpu2 = theano.function(
[a, b],
T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
mode=mode_with_gpu)
assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2 assert sum([node.op == T.alloc for node in f.maker.fgraph.toposort()]) == 2
assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1 assert sum([node.op == T.join for node in f.maker.fgraph.toposort()]) == 1
...@@ -963,9 +978,6 @@ def test_gpualloc_output_to_gpu(): ...@@ -963,9 +978,6 @@ def test_gpualloc_output_to_gpu():
assert numpy.allclose(f(5), f_gpu(5)) assert numpy.allclose(f(5), f_gpu(5))
import theano.tensor.tests.test_basic
class TestAlloc(theano.tensor.tests.test_basic.TestAlloc): class TestAlloc(theano.tensor.tests.test_basic.TestAlloc):
dtype = "float32" dtype = "float32"
mode = mode_with_gpu mode = mode_with_gpu
...@@ -987,7 +999,6 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split): ...@@ -987,7 +999,6 @@ class T_Join_and_Split(theano.tensor.tests.test_basic.T_Join_and_Split):
self.shared = cuda.shared_constructor self.shared = cuda.shared_constructor
import theano.tensor.tests.test_subtensor
# This is to don't duplicate test. # This is to don't duplicate test.
...@@ -1026,7 +1037,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -1026,7 +1037,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# version when we should. Users should not use it. # version when we should. Users should not use it.
for shape, idx, fast in [((70000,), range(70000), True), for shape, idx, fast in [((70000,), range(70000), True),
((70000, 5), range(70000), True), ((70000, 5), range(70000), True),
((70000, 5), numpy.zeros((0,), 'int64'), ((70000, 5), numpy.zeros((0,), 'int64'),
True), True),
((70000, 2, 3), range(70000), True), ((70000, 2, 3), range(70000), True),
((1025, 1025), [5, 10], True), ((1025, 1025), [5, 10], True),
...@@ -1035,7 +1046,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -1035,7 +1046,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
((3, 10, 68000), [1, 2], True), ((3, 10, 68000), [1, 2], True),
((3, 69000, 11), [1, 2], True), ((3, 69000, 11), [1, 2], True),
# much memory, will be disabled if needed # much memory, will be disabled if needed
((2*10e7,), [-1, 199999999], True), ((2 * 10e7,), [-1, 199999999], True),
((4, 5), [2, 3], True), ((4, 5), [2, 3], True),
((4, 2, 3), [0, 3], True), ((4, 2, 3), [0, 3], True),
((4, 2, 3), [3, 3, 1, 1, 2, ((4, 2, 3), [3, 3, 1, 1, 2,
...@@ -1047,8 +1058,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor): ...@@ -1047,8 +1058,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
# optimized for that case. # optimized for that case.
((4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0, ((4, 4, 2, 3), [3, 3, 1, 1, 2, 2, 0, 0,
-1, -2, -3, -4], False), -1, -2, -3, -4], False),
((1, 10), [0, 0], True), ((1, 10), [0, 0], True), ]:
]:
# If there is not enough memory on the GPU, skip the test # If there is not enough memory on the GPU, skip the test
size_needed = numpy.prod(shape) * (4 + 1) size_needed = numpy.prod(shape) * (4 + 1)
if isinstance(theano.compile.get_default_mode(), if isinstance(theano.compile.get_default_mode(),
...@@ -1106,13 +1116,14 @@ def test_advinc_subtensor1(): ...@@ -1106,13 +1116,14 @@ def test_advinc_subtensor1():
rep[[0, 2]] += yval rep[[0, 2]] += yval
utt.assert_allclose(rval, rep) utt.assert_allclose(rval, rep)
def test_advset_subtensor1(): def test_advset_subtensor1():
""" Test GPU version of set_subtensor on vectors (uses GpuAdvancedIncSubtensor1) """ """ Test GPU version of set_subtensor on vectors (uses GpuAdvancedIncSubtensor1) """
shp = (10,) shp = (10,)
shared = cuda.shared_constructor shared = cuda.shared_constructor
xval = numpy.arange(shp[0], dtype='float32').reshape(shp) + 1 xval = numpy.arange(shp[0], dtype='float32').reshape(shp) + 1
idxs = numpy.array([0,2,5,7,3], dtype='int32') idxs = numpy.array([0, 2, 5, 7, 3], dtype='int32')
yval = numpy.ones(len(idxs), dtype='float32')*10 yval = numpy.ones(len(idxs), dtype='float32') * 10
x = shared(xval, name='x') x = shared(xval, name='x')
y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y') y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y')
expr = T.advanced_set_subtensor1(x, y, idxs) expr = T.advanced_set_subtensor1(x, y, idxs)
...@@ -1124,13 +1135,14 @@ def test_advset_subtensor1(): ...@@ -1124,13 +1135,14 @@ def test_advset_subtensor1():
rep[idxs] = yval rep[idxs] = yval
utt.assert_allclose(rval, rep) utt.assert_allclose(rval, rep)
def test_advset_subtensor1_2d(): def test_advset_subtensor1_2d():
""" Test GPU version of set_subtensor on matrices (uses GpuAdvancedIncSubtensor1_dev20 if compute capability >= 2.0) """ """ Test GPU version of set_subtensor on matrices (uses GpuAdvancedIncSubtensor1_dev20 if compute capability >= 2.0) """
shp = (10,5) shp = (10, 5)
shared = cuda.shared_constructor shared = cuda.shared_constructor
xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1 xval = numpy.arange(numpy.prod(shp), dtype='float32').reshape(shp) + 1
idxs = numpy.array([0,2,5,7,3], dtype='int32') idxs = numpy.array([0, 2, 5, 7, 3], dtype='int32')
yval = numpy.ones((len(idxs), shp[1]), dtype='float32')*10 yval = numpy.ones((len(idxs), shp[1]), dtype='float32') * 10
x = shared(xval, name='x') x = shared(xval, name='x')
y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y') y = T.tensor(dtype='float32', broadcastable=(False,) * len(shp), name='y')
expr = T.advanced_set_subtensor1(x, y, idxs) expr = T.advanced_set_subtensor1(x, y, idxs)
...@@ -1142,37 +1154,38 @@ def test_advset_subtensor1_2d(): ...@@ -1142,37 +1154,38 @@ def test_advset_subtensor1_2d():
rep[idxs] = yval rep[idxs] = yval
utt.assert_allclose(rval, rep) utt.assert_allclose(rval, rep)
def test_inc_subtensor(): def test_inc_subtensor():
shared = cuda.shared_constructor cuda.shared_constructor
#shared = tensor.shared # shared = tensor.shared
x, y = T.fmatrices('x', 'y') x, y = T.fmatrices('x', 'y')
xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], xval = numpy.asarray(
dtype='float32') [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]], yval = numpy.asarray(
dtype='float32') [[10, 10, 10], [10, 10, 10], [10, 10, 10]], dtype='float32')
expr = T.inc_subtensor(x[:, 1:3], y[:, 1:3]) expr = T.inc_subtensor(x[:, 1:3], y[:, 1:3])
f = theano.function([x, y], expr, mode=mode_with_gpu) f = theano.function([x, y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
node.op.set_instead_of_inc == False node.op.set_instead_of_inc is False
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
utt.assert_allclose(f(xval, yval), [[1., 12., 13.], utt.assert_allclose(f(xval, yval), [[1., 12., 13.],
[4., 15., 16.], [7., 18., 19.]]) [4., 15., 16.], [7., 18., 19.]])
def test_set_subtensor(): def test_set_subtensor():
shared = cuda.shared_constructor cuda.shared_constructor
#shared = tensor.shared # shared = tensor.shared
x, y = T.fmatrices('x', 'y') x, y = T.fmatrices('x', 'y')
xval = numpy.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], xval = numpy.asarray(
dtype='float32') [[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype='float32')
yval = numpy.asarray([[10, 10, 10], [10, 10, 10], [10, 10, 10]], yval = numpy.asarray(
dtype='float32') [[10, 10, 10], [10, 10, 10], [10, 10, 10]], dtype='float32')
expr = T.set_subtensor(x[:, 1:3], y[:, 1:3]) expr = T.set_subtensor(x[:, 1:3], y[:, 1:3])
f = theano.function([x, y], expr, mode=mode_with_gpu) f = theano.function([x, y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and assert sum([isinstance(node.op, cuda.GpuIncSubtensor) and
node.op.set_instead_of_inc == True node.op.set_instead_of_inc is True
for node in f.maker.fgraph.toposort()]) == 1 for node in f.maker.fgraph.toposort()]) == 1
f(xval, yval) f(xval, yval)
...@@ -1191,7 +1204,7 @@ def test_many_arg_elemwise(): ...@@ -1191,7 +1204,7 @@ def test_many_arg_elemwise():
for arg in xrange(0, num_args)] for arg in xrange(0, num_args)]
symb_args = [theano.tensor.TensorType('float32', symb_args = [theano.tensor.TensorType('float32',
(False,)*nb_dim)() (False,) * nb_dim)()
for arg in xrange(0, num_args)] for arg in xrange(0, num_args)]
outputs = [] outputs = []
...@@ -1313,7 +1326,6 @@ class test_size(unittest.TestCase): ...@@ -1313,7 +1326,6 @@ class test_size(unittest.TestCase):
assert y.size == theano.function([], x.size)() assert y.size == theano.function([], x.size)()
import theano.tensor.tests.test_sharedvar
# This test the case when the shared constructor view an CudaNdarray as input # This test the case when the shared constructor view an CudaNdarray as input
test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester( test_shared_options = theano.tensor.tests.test_sharedvar.makeSharedTester(
shared_constructor_=tcn.shared_constructor, shared_constructor_=tcn.shared_constructor,
...@@ -1374,7 +1386,7 @@ def speed_reduce10(): ...@@ -1374,7 +1386,7 @@ def speed_reduce10():
if __name__ == '__main__': if __name__ == '__main__':
#test_many_arg_elemwise() # test_many_arg_elemwise()
#test_gpujoin_assert_cndas() # test_gpujoin_assert_cndas()
test_advset_subtensor1() test_advset_subtensor1()
test_advset_subtensor1_2d() test_advset_subtensor1_2d()
...@@ -10,7 +10,7 @@ from __future__ import absolute_import, print_function, division ...@@ -10,7 +10,7 @@ from __future__ import absolute_import, print_function, division
# so state is ignored # so state is ignored
# since this job is not restartable, channel is also ignored # since this job is not restartable, channel is also ignored
import logging, time, sys import logging
import numpy import numpy
from six.moves import xrange from six.moves import xrange
...@@ -18,17 +18,22 @@ from six.moves import xrange ...@@ -18,17 +18,22 @@ from six.moves import xrange
import theano import theano
from theano.compile import shared, pfunc from theano.compile import shared, pfunc
from theano import tensor from theano import tensor
from theano.tensor.nnet import softplus
from theano.tensor.nnet.nnet import softsign from theano.tensor.nnet.nnet import softsign
try:
from PIL import Image
except ImportError:
Image = None
# from PIL import Image
_logger = logging.getLogger('theano.sandbox.cuda.tests.test_bench_loopfusion') _logger = logging.getLogger('theano.sandbox.cuda.tests.test_bench_loopfusion')
def _shared_uniform(rng, low, high, size, dtype, name=None): def _shared_uniform(rng, low, high, size, dtype, name=None):
return shared( return shared(
theano._asarray( theano._asarray(
rng.uniform(low=low, high=high, size=size), rng.uniform(low=low, high=high, size=size),
dtype=dtype), name) dtype=dtype),
name)
class Kouh2008(object): class Kouh2008(object):
...@@ -49,8 +54,10 @@ class Kouh2008(object): ...@@ -49,8 +54,10 @@ class Kouh2008(object):
""" """
if len(w_list) != len(x_list): if len(w_list) != len(x_list):
raise ValueError('w_list must have same len as x_list') raise ValueError('w_list must have same len as x_list')
output = (sum(w * tensor.pow(x, p) for (w, x) in zip(w_list, x_list)))\ output = ((sum(w * tensor.pow(x, p)
/ (theano._asarray(eps, dtype=k.type.dtype) + k + tensor.pow(sum(tensor.pow(x, q) for x in x_list), r)) for (w, x) in zip(w_list, x_list))) /
(theano._asarray(eps, dtype=k.type.dtype) + k +
tensor.pow(sum(tensor.pow(x, q) for x in x_list), r)))
assert output.type.ndim == 2 assert output.type.ndim == 2
self.__dict__.update(locals()) self.__dict__.update(locals())
...@@ -80,10 +87,15 @@ class Kouh2008(object): ...@@ -80,10 +87,15 @@ class Kouh2008(object):
w_sm = theano.tensor.nnet.softmax(w) w_sm = theano.tensor.nnet.softmax(w)
w_list = [w_sm[:, i] for i in xrange(n_terms)] w_list = [w_sm[:, i] for i in xrange(n_terms)]
w_l1 = abs(w).sum() w_l1 = abs(w).sum()
w_l2_sqr = (w**2).sum() w_l2_sqr = (w ** 2).sum()
else: else:
w_list = [shared_uniform(low=-2.0/n_terms, high=2.0/n_terms, size=(n_out,), name='w_%i'%i) w_list = [
for i in xrange(n_terms)] shared_uniform(
low=-2.0 / n_terms,
high=2.0 / n_terms,
size=(n_out,),
name='w_%i' % i)
for i in xrange(n_terms)]
w_l1 = sum(abs(wi).sum() for wi in w_list) w_l1 = sum(abs(wi).sum() for wi in w_list)
w_l2_sqr = sum((wi**2).sum() for wi in w_list) w_l2_sqr = sum((wi**2).sum() for wi in w_list)
...@@ -102,19 +114,27 @@ class Kouh2008(object): ...@@ -102,19 +114,27 @@ class Kouh2008(object):
p = tensor.nnet.sigmoid(p_unbounded) * e_range_mag + e_range_low p = tensor.nnet.sigmoid(p_unbounded) * e_range_mag + e_range_low
q = tensor.nnet.sigmoid(q_unbounded) * e_range_mag + e_range_low q = tensor.nnet.sigmoid(q_unbounded) * e_range_mag + e_range_low
r = tensor.nnet.sigmoid(r_unbounded) * \ r = tensor.nnet.sigmoid(r_unbounded) * \
theano._asarray(1.0/e_range_low - 1.0/e_range_high, dtype=dtype) \ theano._asarray(1.0 / e_range_low - 1.0 / e_range_high,
+ theano._asarray(1.0/e_range_high, dtype=dtype) dtype=dtype) + \
theano._asarray(1.0 / e_range_high, dtype=dtype)
k = softsign(k_unbounded) k = softsign(k_unbounded)
if use_softmax_w: if use_softmax_w:
rval = cls(w_list, x_list, p, q, r, k, rval = cls(w_list, x_list, p, q, r, k,
params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded, w] + params, params=[p_unbounded,
updates=updates) q_unbounded,
r_unbounded,
k_unbounded,
w] + params,
updates=updates)
else: else:
rval = cls(w_list, x_list, p, q, r, k, rval = cls(w_list, x_list, p, q, r, k,
params=[p_unbounded, q_unbounded, r_unbounded, k_unbounded] + w_list + params, params=[p_unbounded,
updates=updates) q_unbounded,
r_unbounded,
k_unbounded] + w_list + params,
updates=updates)
rval.p_unbounded = p_unbounded rval.p_unbounded = p_unbounded
rval.q_unbounded = q_unbounded rval.q_unbounded = q_unbounded
rval.r_unbounded = r_unbounded rval.r_unbounded = r_unbounded
...@@ -126,8 +146,10 @@ class Kouh2008(object): ...@@ -126,8 +146,10 @@ class Kouh2008(object):
return rval return rval
@classmethod @classmethod
def new_filters_expbounds(cls, rng, input, n_in, n_out, n_terms, dtype=None, eps=1e-1, def new_filters_expbounds(cls, rng, input, n_in, n_out, n_terms,
exponent_range=(1.0, 3.0), filter_range=1.0): dtype=None, eps=1e-1,
exponent_range=(1.0, 3.0),
filter_range=1.0):
"""Return a KouhLayer instance with random parameters """Return a KouhLayer instance with random parameters
The parameters are drawn on a range [typically] suitable for fine-tuning by gradient The parameters are drawn on a range [typically] suitable for fine-tuning by gradient
...@@ -161,19 +183,30 @@ class Kouh2008(object): ...@@ -161,19 +183,30 @@ class Kouh2008(object):
def shared_uniform(low, high, size, name): def shared_uniform(low, high, size, name):
return _shared_uniform(rng, low, high, size, dtype, name) return _shared_uniform(rng, low, high, size, dtype, name)
f_list = [shared_uniform(low=-2.0/numpy.sqrt(n_in), high=2.0/numpy.sqrt(n_in), size=(n_in, n_out), name='f_%i'%i) f_list = [shared_uniform(low=-2.0 / numpy.sqrt(n_in),
for i in xrange(n_terms)] high=2.0 / numpy.sqrt(n_in),
size=(n_in, n_out),
b_list = [shared_uniform(low=0, high=.01, size=(n_out,), name='b_%i'%i) name='f_%i' % i)
for i in xrange(n_terms)] for i in xrange(n_terms)]
#x_list = [theano._asarray(eps, dtype=dtype)+softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
b_list = [shared_uniform(low=0,
high=.01,
size=(n_out,),
name='b_%i' % i)
for i in xrange(n_terms)]
# x_list = [theano._asarray(eps, dtype=dtype) + softplus(tensor.dot(input, f_list[i])) for i in xrange(n_terms)]
filter_range = theano._asarray(filter_range, dtype=dtype) filter_range = theano._asarray(filter_range, dtype=dtype)
half_filter_range = theano._asarray(filter_range/2, dtype=dtype) half_filter_range = theano._asarray(filter_range / 2,
x_list = [theano._asarray(filter_range + eps, dtype=dtype)+half_filter_range * softsign(tensor.dot(input, f_list[i]) + dtype=dtype)
b_list[i]) for i in xrange(n_terms)] x_list = [
theano._asarray(filter_range + eps, dtype=dtype) +
rval = cls.new_expbounds(rng, x_list, n_out, dtype=dtype, params=f_list + b_list, half_filter_range * softsign(
exponent_range=exponent_range) tensor.dot(input, f_list[i]) + b_list[i])
for i in xrange(n_terms)]
rval = cls.new_expbounds(
rng, x_list, n_out, dtype=dtype, params=f_list + b_list,
exponent_range=exponent_range)
rval.f_list = f_list rval.f_list = f_list
rval.input = input # add the input to the returned object rval.input = input # add the input to the returned object
rval.filter_l1 = sum(abs(fi).sum() for fi in f_list) rval.filter_l1 = sum(abs(fi).sum() for fi in f_list)
...@@ -183,6 +216,8 @@ class Kouh2008(object): ...@@ -183,6 +216,8 @@ class Kouh2008(object):
def img_from_weights(self, rows=None, cols=None, row_gap=1, col_gap=1, eps=1e-4): def img_from_weights(self, rows=None, cols=None, row_gap=1, col_gap=1, eps=1e-4):
""" Return an image that visualizes all the weights in the layer. """ Return an image that visualizes all the weights in the layer.
""" """
if Image is None:
raise ImportError("No module named PIL")
n_in, n_out = self.f_list[0].value.shape n_in, n_out = self.f_list[0].value.shape
...@@ -190,10 +225,12 @@ class Kouh2008(object): ...@@ -190,10 +225,12 @@ class Kouh2008(object):
rows = int(numpy.sqrt(n_out)) rows = int(numpy.sqrt(n_out))
if cols is None: if cols is None:
cols = n_out // rows cols = n_out // rows
if n_out % rows: cols += 1 if n_out % rows:
cols += 1
if rows is None: if rows is None:
rows = n_out // cols rows = n_out // cols
if n_out % cols: rows += 1 if n_out % cols:
rows += 1
filter_shape = self.filter_shape filter_shape = self.filter_shape
height = rows * (row_gap + filter_shape[0]) - row_gap height = rows * (row_gap + filter_shape[0]) - row_gap
...@@ -203,34 +240,40 @@ class Kouh2008(object): ...@@ -203,34 +240,40 @@ class Kouh2008(object):
w = self.w.value w = self.w.value
w_col = 0 w_col = 0
def pixel_range(x): def pixel_range(x):
return 255 * (x - x.min()) / (x.max() - x.min() + eps) return 255 * (x - x.min()) / (x.max() - x.min() + eps)
for r in xrange(rows): for r in xrange(rows):
out_r_low = r*(row_gap + filter_shape[0]) out_r_low = r * (row_gap + filter_shape[0])
out_r_high = out_r_low + filter_shape[0] out_r_high = out_r_low + filter_shape[0]
for c in xrange(cols): for c in xrange(cols):
out_c_low = c*(col_gap + filter_shape[1]) out_c_low = c * (col_gap + filter_shape[1])
out_c_high = out_c_low + filter_shape[1] out_c_high = out_c_low + filter_shape[1]
out_tile = out_array[out_r_low:out_r_high, out_c_low:out_c_high, :] out_tile = out_array[out_r_low:out_r_high,
out_c_low:out_c_high,
:]
if c % 3 == 0: # linear filter if c % 3 == 0: # linear filter
if w_col < w.shape[1]: if w_col < w.shape[1]:
out_tile[...] = pixel_range(w[:, w_col]).reshape(filter_shape+(1,)) out_tile[...] = pixel_range(
w[:, w_col]).reshape(filter_shape + (1,))
w_col += 1 w_col += 1
if c % 3 == 1: # E filters if c % 3 == 1: # E filters
if w_col < w.shape[1]: if w_col < w.shape[1]:
# filters after the 3rd do not get rendered, but are skipped over. # filters after the 3rd do not get rendered, but are skipped over.
# there are only 3 colour channels. # there are only 3 colour channels.
for i in xrange(min(self.n_E_quadratic, 3)): for i in xrange(min(self.n_E_quadratic, 3)):
out_tile[:, :, i] = pixel_range(w[:, w_col+i]).reshape(filter_shape) out_tile[:, :, i] = pixel_range(
w[:, w_col + i]).reshape(filter_shape)
w_col += self.n_E_quadratic w_col += self.n_E_quadratic
if c % 3 == 2: # S filters if c % 3 == 2: # S filters
if w_col < w.shape[1]: if w_col < w.shape[1]:
# filters after the 3rd do not get rendered, but are skipped over. # filters after the 3rd do not get rendered, but are skipped over.
# there are only 3 colour channels. # there are only 3 colour channels.
for i in xrange(min(self.n_S_quadratic, 3)): for i in xrange(min(self.n_S_quadratic, 3)):
out_tile[:, :, 2-i] = pixel_range(w[:, w_col+i]).reshape(filter_shape) out_tile[:, :, 2 - i] = pixel_range(
w[:, w_col + i]).reshape(filter_shape)
w_col += self.n_S_quadratic w_col += self.n_S_quadratic
return Image.fromarray(out_array, 'RGB') return Image.fromarray(out_array, 'RGB')
...@@ -264,8 +307,9 @@ class Config(object): ...@@ -264,8 +307,9 @@ class Config(object):
ft_batchsize = 30 ft_batchsize = 30
ft_epoch_len = 50000 ft_epoch_len = 50000
ft_status_interval = 50 # property( lambda s:s.ft_epoch_len/s.ft_batchsize) ft_status_interval = 50 # property(lambda s:s.ft_epoch_len/s.ft_batchsize)
ft_validation_interval = property( lambda s: s.ft_epoch_len/s.ft_batchsize) ft_validation_interval = property(
lambda s: s.ft_epoch_len / s.ft_batchsize)
ft_ntrain_limit = 0 ft_ntrain_limit = 0
ft_test_lag1 = True ft_test_lag1 = True
...@@ -290,14 +334,15 @@ if 0: ...@@ -290,14 +334,15 @@ if 0:
debug = False debug = False
if isinstance(theano.compile.mode.get_default_mode(), if isinstance(theano.compile.mode.get_default_mode(),
theano.compile.debugmode.DebugMode): theano.compile.debugmode.DebugMode):
debug = True debug = True
# get symbolic train set # get symbolic train set
s_lr = theano.tensor.fscalar() s_lr = theano.tensor.fscalar()
if not debug: if not debug:
sshape = (None, 784) sshape = (None, 784)
else: sshape = (None, 3) else:
sshape = (None, 3)
x = theano.tensor.TensorType(dtype=conf.dtype, broadcastable=(0, 0), shape=sshape)() x = theano.tensor.TensorType(dtype=conf.dtype, broadcastable=(0, 0), shape=sshape)()
y = theano.tensor.lvector() y = theano.tensor.lvector()
...@@ -315,7 +360,8 @@ if 0: ...@@ -315,7 +360,8 @@ if 0:
print(layer.params) print(layer.params)
gparams = theano.tensor.grad(cost, layer.params) gparams = theano.tensor.grad(cost, layer.params)
updates = [(p, p - s_lr*gp) for p, gp in zip(layer.params, gparams)] updates = [
(p, p - s_lr * gp) for p, gp in zip(layer.params, gparams)]
train_nll = pfunc([x, y, s_lr], [], updates=updates) train_nll = pfunc([x, y, s_lr], [], updates=updates)
......
...@@ -8,31 +8,31 @@ from theano import tensor ...@@ -8,31 +8,31 @@ from theano import tensor
from theano.tests import unittest_tools from theano.tests import unittest_tools
import numpy import numpy
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda as tcn import theano.sandbox.cuda as tcn
from theano.tensor.signal.pool import (Pool,
PoolGrad, DownsampleFactorMaxGradGrad)
import theano.compile.mode import theano.compile.mode
from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer from theano.tensor.tests.test_blas import BaseGemv, TestBlasStrides, TestGer
from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace from theano.sandbox.cuda.blas import gpu_gemv_no_inplace, gpu_gemv_inplace
from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace from theano.sandbox.cuda.blas import gpu_ger_inplace, gpu_ger_no_inplace
from theano.sandbox.cuda.blas import batched_dot, GpuBatchedDot from theano.sandbox.cuda.blas import batched_dot, GpuBatchedDot
from theano.tensor.signal.pool import (Pool, PoolGrad, DownsampleFactorMaxGradGrad)
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode(
'FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode( mode_without_gpu = theano.compile.mode.get_mode(
'FAST_RUN').excluding('gpu') 'FAST_RUN').excluding('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode(
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpu') ).including('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode(
).excluding('gpu')
# The CPU tests already compare C/Py, so we only check C/GPU # The CPU tests already compare C/Py, so we only check C/GPU
mode_with_gpu = copy.copy(mode_with_gpu) mode_with_gpu = copy.copy(mode_with_gpu)
...@@ -55,73 +55,81 @@ class TestBatchedDot(unittest_tools.InferShapeTester): ...@@ -55,73 +55,81 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a=numpy.random.randn(*a_shp).astype(numpy.float32) a = numpy.random.randn(* a_shp).astype(numpy.float32)
b=numpy.random.randn(*b_shp).astype(numpy.float32) b = numpy.random.randn(* b_shp).astype(numpy.float32)
x=tensor.ftensor3() x = tensor.ftensor3()
y=tensor.ftensor3() y = tensor.ftensor3()
f=theano.function([x,y], batched_dot(x,y), mode=mode_with_gpu) f = theano.function([x, y],
batched_dot(x, y),
mode=mode_with_gpu)
z0=numpy.asarray(f(a,b)) z0 = numpy.asarray(f(a, b))
ga = cuda_ndarray.CudaNdarray(a) ga = cuda_ndarray.CudaNdarray(a)
gb = cuda_ndarray.CudaNdarray(b) gb = cuda_ndarray.CudaNdarray(b)
z1=numpy.asarray(f(ga,gb)) z1 = numpy.asarray(f(ga, gb))
z_test = numpy.sum(a[:,:,:,None]*b[:,None,:,:],axis=-2) z_test = numpy.sum(
a[:, :, :, None] * b[:, None, :, :], axis=-2)
z1 = numpy.asarray(f(ga, gb))
z_test = numpy.sum(
a[:, :, :, None] * b[:, None, :, :], axis=-2)
unittest_tools.assert_allclose(z0, z_test) unittest_tools.assert_allclose(z0, z_test)
unittest_tools.assert_allclose(z1, z_test) unittest_tools.assert_allclose(z1, z_test)
cmp((5,4,3), (5,3,2)) cmp((5, 4, 3), (5, 3, 2))
cmp((5,3,3), (5,3,3)) cmp((5, 3, 3), (5, 3, 3))
cmp((5,2,6), (5,6,3)) cmp((5, 2, 6), (5, 6, 3))
# Test dimensions of 0 # Test dimensions of 0
cmp((0,2,6), (0,6,3)) cmp((0, 2, 6), (0, 6, 3))
cmp((5,0,3), (5,3,2)) cmp((5, 0, 3), (5, 3, 2))
cmp((5,4,0), (5,0,2)) cmp((5, 4, 0), (5, 0, 2))
cmp((5,4,3), (5,3,0)) cmp((5, 4, 3), (5, 3, 0))
cmp((0,0,0), (0,0,0)) cmp((0, 0, 0), (0, 0, 0))
# Test dimensions of 1 # Test dimensions of 1
cmp((1,2,6), (1,6,3)) cmp((1, 2, 6), (1, 6, 3))
cmp((5,1,3), (5,3,2)) cmp((5, 1, 3), (5, 3, 2))
cmp((5,4,1), (5,1,2)) cmp((5, 4, 1), (5, 1, 2))
cmp((5,4,3), (5,3,1)) cmp((5, 4, 3), (5, 3, 1))
def test_batched_dot_errors(self):
    # batched_dot compiled with mode_with_gpu is expected to raise
    # RuntimeError when called with operands of incompatible shapes.

    def fail(a_shp, b_shp):
        # Build random float32 operands of the requested shapes, compile
        # batched_dot for two ftensor3 inputs and evaluate it.
        lhs_val = numpy.random.randn(*a_shp).astype(numpy.float32)
        rhs_val = numpy.random.randn(*b_shp).astype(numpy.float32)

        lhs_sym = tensor.ftensor3()
        rhs_sym = tensor.ftensor3()

        compiled = theano.function([lhs_sym, rhs_sym],
                                   batched_dot(lhs_sym, rhs_sym),
                                   mode=mode_with_gpu)

        compiled(lhs_val, rhs_val)

    bad_rhs_shapes = [
        (6, 3, 2),  # Different batch size
        (5, 2, 2),  # Shape mismatch
    ]
    for rhs_shape in bad_rhs_shapes:
        self.assertRaises(RuntimeError, fail, (5, 4, 3), rhs_shape)
def test_batched_dot_gradient(self):
    # Run unittest_tools.verify_grad on batched_dot with random float32
    # operands of shapes (5, 7, 2) and (5, 2, 6), using mode_with_gpu.
    # NOTE(review): an earlier revision of this test ran the check on
    # GpuBatchedDot(stream_threshold=t) for t in [0, 100] -- confirm that
    # dropping that coverage is intended.
    lhs_val = numpy.random.randn(5, 7, 2).astype(numpy.float32)
    rhs_val = numpy.random.randn(5, 2, 6).astype(numpy.float32)
    unittest_tools.verify_grad(batched_dot,
                               [lhs_val, rhs_val],
                               mode=mode_with_gpu)
def test_infer_shape(self): def test_infer_shape(self):
# only matrix/matrix is supported # only matrix / matrix is supported
admat = tensor.ftensor3() admat = tensor.ftensor3()
bdmat = tensor.ftensor3() bdmat = tensor.ftensor3()
admat_val = my_rand(7, 4, 5) admat_val = my_rand(7, 4, 5)
...@@ -134,24 +142,23 @@ class TestBatchedDot(unittest_tools.InferShapeTester): ...@@ -134,24 +142,23 @@ class TestBatchedDot(unittest_tools.InferShapeTester):
def test_dot22(): def test_dot22():
def cmp(a_shp, b_shp):
    # Apply the update a <- dot(a, b) through a function compiled with
    # mode_with_gpu and compare the stored value with numpy.dot.
    init_a = my_rand(*a_shp)
    shared_a = tcn.shared_constructor(init_a, 'a')

    b = tensor.fmatrix()

    update_fn = pfunc([b], [],
                      updates=[(shared_a, tensor.dot(shared_a, b))],
                      mode=mode_with_gpu)

    bval = my_rand(*b_shp)
    update_fn(bval)

    expected = numpy.dot(init_a, bval)
    assert numpy.allclose(expected, shared_a.get_value())

    # Try with a matrix equal to a0, but with strides in both dims
    shared_a.set_value(init_a)
    reversed_view = shared_a.get_value(
        borrow=True, return_internal_type=True)[::-1, ::-1]
    shared_a.set_value(reversed_view, borrow=True)
    update_fn(bval)
cmp((3, 4), (4, 5)) cmp((3, 4), (4, 5))
...@@ -171,12 +178,12 @@ def test_dot22scalar(): ...@@ -171,12 +178,12 @@ def test_dot22scalar():
bv = my_rand(*b_shp) bv = my_rand(*b_shp)
f = theano.function( f = theano.function(
[a, b], [a, b],
tensor.dot(a, b) * numpy.asarray(4, 'float32'), tensor.dot(a, b) * numpy.asarray(4, 'float32'),
mode=mode_with_gpu) mode=mode_with_gpu)
f2 = theano.function( f2 = theano.function(
[a, b], [a, b],
tensor.dot(a, b) * numpy.asarray(4, 'float32')) tensor.dot(a, b) * numpy.asarray(4, 'float32'))
t = f.maker.fgraph.toposort() t = f.maker.fgraph.toposort()
assert any([isinstance(n.op, tcn.blas.GpuDot22Scalar) for n in t]) assert any([isinstance(n.op, tcn.blas.GpuDot22Scalar) for n in t])
# assert any([isinstance(n.op, tcn.basic_ops.GpuAllocEmpty) # assert any([isinstance(n.op, tcn.basic_ops.GpuAllocEmpty)
...@@ -220,23 +227,22 @@ def test_gemm(): ...@@ -220,23 +227,22 @@ def test_gemm():
c = tensor.fmatrix('c') c = tensor.fmatrix('c')
f = pfunc([b, c], [], updates=[(a, tensor.dot(a, b) + tensor.exp(c))], f = pfunc([b, c], [], updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
mode=mode_with_gpu) mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_inplace assert any([node.op == tcn.blas.gpu_gemm_inplace
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
bval = my_rand(*b_shp) bval = my_rand(* b_shp)
cval = my_rand(a_shp[0], b_shp[1]) cval = my_rand(a_shp[0], b_shp[1])
f(bval, cval) f(bval, cval)
assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval), assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval),
a.get_value()) a.get_value())
# Try with a matrix equal to a0, but with strides in both dims # Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0) a.set_value(a0)
a.set_value( a.set_value(a.get_value(borrow=True,
a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
return_internal_type=True)[::-1, ::-1], borrow=True)
borrow=True)
f(bval, cval) f(bval, cval)
cmp((3, 4), (4, 5)) cmp((3, 4), (4, 5))
...@@ -250,7 +256,7 @@ def test_gemm(): ...@@ -250,7 +256,7 @@ def test_gemm():
def test_gemm_no_inplace(): def test_gemm_no_inplace():
def cmp(a_shp, b_shp): def cmp(a_shp, b_shp):
a0 = my_rand(*a_shp) a0 = my_rand(* a_shp)
a = tcn.shared_constructor(a0, 'a') a = tcn.shared_constructor(a0, 'a')
cval = my_rand(a_shp[0], b_shp[1]) cval = my_rand(a_shp[0], b_shp[1])
c = tcn.shared_constructor(cval.copy(), 'c') c = tcn.shared_constructor(cval.copy(), 'c')
...@@ -258,14 +264,13 @@ def test_gemm_no_inplace(): ...@@ -258,14 +264,13 @@ def test_gemm_no_inplace():
b = tcn.fmatrix('b') b = tcn.fmatrix('b')
b2 = tcn.fmatrix('b2') b2 = tcn.fmatrix('b2')
f = pfunc( f = pfunc([b, b2],
[b, b2], [tensor.dot(a, b2) + c],
[tensor.dot(a, b2) + c], updates=[(a, tensor.dot(a, b) + c)],
updates=[(a, tensor.dot(a, b) + c)], mode=mode_with_gpu)
mode=mode_with_gpu)
assert any([node.op == tcn.blas.gpu_gemm_no_inplace assert any([node.op == tcn.blas.gpu_gemm_no_inplace
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
bval = my_rand(*b_shp) bval = my_rand(*b_shp)
bval2 = my_rand(*b_shp) bval2 = my_rand(*b_shp)
rval = f(bval, bval2) rval = f(bval, bval2)
...@@ -276,9 +281,10 @@ def test_gemm_no_inplace(): ...@@ -276,9 +281,10 @@ def test_gemm_no_inplace():
# Try with a matrix equal to a0, but with strides in both dims # Try with a matrix equal to a0, but with strides in both dims
a.set_value(a0) a.set_value(a0)
a.set_value( a.set_value(
a.get_value(borrow=True, a.get_value(
return_internal_type=True)[::-1, ::-1], borrow=True,
borrow=True) return_internal_type=True)[::-1, ::-1],
borrow=True)
f(bval, bval2) f(bval, bval2)
cmp((3, 4), (4, 5)) cmp((3, 4), (4, 5))
...@@ -303,8 +309,8 @@ if 0: ...@@ -303,8 +309,8 @@ if 0:
def test_maxpool(): def test_maxpool():
"""TODO: test the gpu version!!! """ """TODO: test the gpu version!!! """
for d0, d1, r_true, r_false in [(4, 4, [[[[5, 7], [13, 15]]]], [[[[5, 7], [13, 15]]]]), for d0, d1, r_true, r_false in [(4, 4, [[[[5, 7], [13, 15]]]], [[[[5, 7], [13, 15]]]]),
(5, 5, [[[[6, 8], [ 16, 18], [ 21, 23]]]], (5, 5, [[[[6, 8], [16, 18], [21, 23]]]],
[[[[6, 8, 9], [ 16, 18, 19], [ 21, 23, 24]]]])]: [[[[6, 8, 9], [16, 18, 19], [21, 23, 24]]]])]:
for border, ret in [(True, r_true), (False, r_false)]: for border, ret in [(True, r_true), (False, r_false)]:
ret = numpy.array(ret) ret = numpy.array(ret)
a = tcn.blas.Pool((2, 2), border) a = tcn.blas.Pool((2, 2), border)
...@@ -312,7 +318,7 @@ if 0: ...@@ -312,7 +318,7 @@ if 0:
b = dmatrix4() b = dmatrix4()
f = pfunc([b], [a(b)], mode=mode_with_gpu) f = pfunc([b], [a(b)], mode=mode_with_gpu)
bval = numpy.arange(0, d0*d1).reshape(1, 1, d0, d1) bval = numpy.arange(0, d0 * d1).reshape(1, 1, d0, d1)
r = f(bval)[0] r = f(bval)[0]
# print bval, bval.shape, border # print bval, bval.shape, border
# print r, r.shape # print r, r.shape
...@@ -347,8 +353,7 @@ def test_downsample(): ...@@ -347,8 +353,7 @@ def test_downsample():
(1, 1, 1025, 10), (1, 1, 1025, 10),
(1, 1, 1023, 10), (1, 1, 1023, 10),
(65536, 1, 10, 10), (65536, 1, 10, 10),
(1, 65536, 10, 10), (1, 65536, 10, 10), ]
]
numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps) numpy.random.RandomState(unittest_tools.fetch_seed()).shuffle(shps)
...@@ -368,14 +373,14 @@ def test_downsample(): ...@@ -368,14 +373,14 @@ def test_downsample():
a = tcn.shared_constructor(my_rand(*shp), 'a') a = tcn.shared_constructor(my_rand(*shp), 'a')
f = pfunc([], ds_op(tensor.as_tensor_variable(a)), f = pfunc([], ds_op(tensor.as_tensor_variable(a)),
mode=mode_with_gpu.excluding('cudnn')) mode=mode_with_gpu.excluding('cudnn'))
f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)), f2 = pfunc([], ds_op(tensor.as_tensor_variable(a)),
mode=mode_without_gpu) mode=mode_without_gpu)
assert any([isinstance(node.op, assert any([isinstance(node.op,
tcn.blas.GpuDownsampleFactorMax) tcn.blas.GpuDownsampleFactorMax)
for node in f.maker.fgraph.toposort()]) for node in f.maker.fgraph.toposort()])
assert any([isinstance(node.op, Pool) assert any([isinstance(node.op, Pool)
for node in f2.maker.fgraph.toposort()]) for node in f2.maker.fgraph.toposort()])
assert numpy.allclose(f(), f2()) assert numpy.allclose(f(), f2())
# The grad is too slow on GT220 GPU # The grad is too slow on GT220 GPU
...@@ -387,15 +392,15 @@ def test_downsample(): ...@@ -387,15 +392,15 @@ def test_downsample():
continue continue
g = pfunc( g = pfunc(
[], [],
tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(),
a), a),
mode=mode_with_gpu.excluding('cudnn')) mode=mode_with_gpu.excluding('cudnn'))
g2 = pfunc( g2 = pfunc(
[], [],
tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(), tensor.grad(ds_op(tensor.as_tensor_variable(a)).sum(),
a), a),
mode=mode_without_gpu) mode=mode_without_gpu)
assert any([isinstance(node.op, assert any([isinstance(node.op,
tcn.blas.GpuDownsampleFactorMaxGrad) tcn.blas.GpuDownsampleFactorMaxGrad)
for node in g.maker.fgraph.toposort()]) for node in g.maker.fgraph.toposort()])
...@@ -413,11 +418,12 @@ def test_downsample(): ...@@ -413,11 +418,12 @@ def test_downsample():
gg = pfunc([], ggf, mode=gpu_mode) gg = pfunc([], ggf, mode=gpu_mode)
gg2 = pfunc([], ggf, mode=ref_mode) gg2 = pfunc([], ggf, mode=ref_mode)
assert any([isinstance(node.op, assert any([isinstance(
tcn.blas.GpuDownsampleFactorMaxGradGrad) node.op, tcn.blas.GpuDownsampleFactorMaxGradGrad)
for node in gg.maker.fgraph.toposort()]) for node in gg.maker.fgraph.toposort()])
assert any([isinstance(node.op, DownsampleFactorMaxGradGrad) assert any([isinstance(
for node in gg2.maker.fgraph.toposort()]) node.op, DownsampleFactorMaxGradGrad)
for node in gg2.maker.fgraph.toposort()])
assert numpy.allclose(gg(), gg2()), shp assert numpy.allclose(gg(), gg2()), shp
# We already check that the gpu version return # We already check that the gpu version return
...@@ -434,6 +440,7 @@ class TestGpuGemv(TestCase, BaseGemv, ...@@ -434,6 +440,7 @@ class TestGpuGemv(TestCase, BaseGemv,
gemv = gpu_gemv_no_inplace gemv = gpu_gemv_no_inplace
gemv_inplace = gpu_gemv_inplace gemv_inplace = gpu_gemv_inplace
# Mimic shared constructors registry # Mimic shared constructors registry
@staticmethod @staticmethod
def shared(val): def shared(val):
# If we don't put shared on the GPU, we won't be able to test # If we don't put shared on the GPU, we won't be able to test
...@@ -445,7 +452,7 @@ class TestGpuGemv(TestCase, BaseGemv, ...@@ -445,7 +452,7 @@ class TestGpuGemv(TestCase, BaseGemv,
class TestGpuGemvNoTransfer(TestCase, BaseGemv, class TestGpuGemvNoTransfer(TestCase, BaseGemv,
unittest_tools.TestOptimizationMixin): unittest_tools.TestOptimizationMixin):
mode = mode_with_gpu mode = mode_with_gpu
dtype = 'float32' dtype = 'float32'
...@@ -471,13 +478,13 @@ class TestVectorMatrixDot(TestCase): ...@@ -471,13 +478,13 @@ class TestVectorMatrixDot(TestCase):
''' Test vector dot matrix ''' ''' Test vector dot matrix '''
v = theano.shared(numpy.array(numpy.random.rand(2), dtype='float32')) v = theano.shared(numpy.array(numpy.random.rand(2), dtype='float32'))
m = theano.shared(numpy.array(numpy.random.rand(2, 5), m = theano.shared(numpy.array(numpy.random.rand(2, 5),
dtype='float32')) dtype='float32'))
no_gpu_f = theano.function([], theano.dot(v, m), mode=mode_without_gpu) no_gpu_f = theano.function([], theano.dot(v, m), mode=mode_without_gpu)
gpu_f = theano.function([], theano.dot(v, m), mode=mode_with_gpu) gpu_f = theano.function([], theano.dot(v, m), mode=mode_with_gpu)
# gpu_f2 is needed to test the case when the input is not on the gpu # gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu. # but the output is moved to the gpu.
gpu_f2 = theano.function([], tcn.gpu_from_host(theano.dot(v, m)), gpu_f2 = theano.function([], tcn.gpu_from_host(theano.dot(v, m)),
mode=mode_with_gpu) mode=mode_with_gpu)
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
...@@ -490,9 +497,9 @@ class TestVectorMatrixDot(TestCase): ...@@ -490,9 +497,9 @@ class TestVectorMatrixDot(TestCase):
# Check double-strided m # Check double-strided m
m.set_value( m.set_value(
m.get_value(borrow=True, m.get_value(borrow=True,
return_internal_type=True)[::-1, ::-1], return_internal_type=True)[::-1, ::-1],
borrow=True) borrow=True)
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f2(), atol=self.atol)
...@@ -500,13 +507,13 @@ class TestVectorMatrixDot(TestCase): ...@@ -500,13 +507,13 @@ class TestVectorMatrixDot(TestCase):
''' Test matrix dot vector ''' ''' Test matrix dot vector '''
v = theano.shared(numpy.array(numpy.random.rand(2), dtype='float32')) v = theano.shared(numpy.array(numpy.random.rand(2), dtype='float32'))
m = theano.shared(numpy.array(numpy.random.rand(5, 2), m = theano.shared(numpy.array(numpy.random.rand(5, 2),
dtype='float32')) dtype='float32'))
no_gpu_f = theano.function([], theano.dot(m, v), mode=mode_without_gpu) no_gpu_f = theano.function([], theano.dot(m, v), mode=mode_without_gpu)
gpu_f = theano.function([], theano.dot(m, v), mode=mode_with_gpu) gpu_f = theano.function([], theano.dot(m, v), mode=mode_with_gpu)
# gpu_f2 is needed to test the case when the input is not on the gpu # gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu. # but the output is moved to the gpu.
gpu_f2 = theano.function([], tcn.gpu_from_host(theano.dot(m, v)), gpu_f2 = theano.function([], tcn.gpu_from_host(theano.dot(m, v)),
mode=mode_with_gpu) mode=mode_with_gpu)
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
...@@ -520,19 +527,21 @@ class TestVectorMatrixDot(TestCase): ...@@ -520,19 +527,21 @@ class TestVectorMatrixDot(TestCase):
def test_gemv1(self): def test_gemv1(self):
''' test vector1+dot(matrix,vector2) ''' ''' test vector1+dot(matrix,vector2) '''
v1 = theano.tensor._shared(numpy.array(numpy.random.rand(2), v1 = theano.tensor._shared(numpy.array(numpy.random.rand(2),
dtype='float32')) dtype='float32'))
v2 = theano.tensor._shared(numpy.array(numpy.random.rand(5), v2 = theano.tensor._shared(numpy.array(numpy.random.rand(5),
dtype='float32')) dtype='float32'))
m = theano.tensor._shared(numpy.array(numpy.random.rand(5, 2), m = theano.tensor._shared(numpy.array(numpy.random.rand(5, 2),
dtype='float32')) dtype='float32'))
no_gpu_f = theano.function([], v2 + theano.dot(m, v1), no_gpu_f = theano.function([], v2 + theano.dot(m, v1),
mode=mode_without_gpu) mode=mode_without_gpu)
gpu_f = theano.function([], v2 + theano.dot(m, v1), mode=mode_with_gpu) gpu_f = theano.function([], v2 + theano.dot(m, v1), mode=mode_with_gpu)
# gpu_f2 is needed to test the case when the input is not on the gpu # gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu. # but the output is moved to the gpu.
gpu_f2 = theano.function([], tcn.gpu_from_host(v2 + theano.dot(m, v1)), gpu_f2 = theano.function(
mode=mode_with_gpu) [],
tcn.gpu_from_host(v2 + theano.dot(m, v1)),
mode=mode_with_gpu)
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
...@@ -548,16 +557,17 @@ class TestVectorMatrixDot(TestCase): ...@@ -548,16 +557,17 @@ class TestVectorMatrixDot(TestCase):
v1 = theano.shared(numpy.array(numpy.random.rand(5), dtype='float32')) v1 = theano.shared(numpy.array(numpy.random.rand(5), dtype='float32'))
v2 = tensor._shared(numpy.array(numpy.random.rand(2), dtype='float32')) v2 = tensor._shared(numpy.array(numpy.random.rand(2), dtype='float32'))
m = theano.shared(numpy.array(numpy.random.rand(5, 2), m = theano.shared(numpy.array(numpy.random.rand(5, 2),
dtype='float32')) dtype='float32'))
no_gpu_f = theano.function([], v2 + theano.dot(v1, m), no_gpu_f = theano.function([], v2 + theano.dot(v1, m),
mode=mode_without_gpu) mode=mode_without_gpu)
gpu_f = theano.function([], v2 + theano.dot(v1, m), gpu_f = theano.function([], v2 + theano.dot(v1, m),
mode=mode_with_gpu) mode=mode_with_gpu)
# gpu_f2 is needed to test the case when the input is not on the gpu # gpu_f2 is needed to test the case when the input is not on the gpu
# but the output is moved to the gpu. # but the output is moved to the gpu.
gpu_f2 = theano.function([], tcn.gpu_from_host(v2 + theano.dot(v1, m)), gpu_f2 = theano.function(
mode=mode_with_gpu) [], tcn.gpu_from_host(v2 + theano.dot(v1, m)),
mode=mode_with_gpu)
# Assert they produce the same output # Assert they produce the same output
assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol) assert numpy.allclose(no_gpu_f(), gpu_f(), atol=self.atol)
......
...@@ -2,14 +2,16 @@ ...@@ -2,14 +2,16 @@
Tests for GPU convolution Tests for GPU convolution
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import sys
import time import time
import unittest import unittest
import traceback import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng, assert_allclose
from theano.sandbox import cuda
import numpy import numpy
from six.moves import xrange from six.moves import xrange
from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
from nose.tools import assert_raises from nose.tools import assert_raises
imported_scipy_convolve2d = False imported_scipy_convolve2d = False
...@@ -19,16 +21,10 @@ try: ...@@ -19,16 +21,10 @@ try:
except ImportError: except ImportError:
pass pass
import theano
from theano import tensor
from theano.tests.unittest_tools import seed_rng, assert_allclose
# Skip test if cuda is not available. # Skip test if cuda is not available.
from theano.sandbox import cuda if cuda.cuda_available is False:
if cuda.cuda_available == False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
from theano.sandbox.cuda.dnn import GpuDnnConv, DnnBase, dnn_conv
# needed as the gpu conv don't have a perform implementation. # needed as the gpu conv don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
...@@ -56,8 +52,8 @@ device_prop = cuda_ndarray.device_properties(device_id) ...@@ -56,8 +52,8 @@ device_prop = cuda_ndarray.device_properties(device_id)
def py_conv_valid_numpy(img, kern): def py_conv_valid_numpy(img, kern):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
outshp = (img.shape[0], kern.shape[0], outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1, img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1) img.shape[3] - kern.shape[3] + 1)
out = numpy.zeros(outshp, dtype='float32') out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]): for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]): for k in xrange(out.shape[1]):
...@@ -106,11 +102,11 @@ def py_conv(img, kern, mode, subsample): ...@@ -106,11 +102,11 @@ def py_conv(img, kern, mode, subsample):
if imported_scipy_convolve2d: if imported_scipy_convolve2d:
return py_conv_scipy(img, kern, mode, subsample) return py_conv_scipy(img, kern, mode, subsample)
elif mode == 'valid': elif mode == 'valid':
return py_conv_valid_numpy(img, kern)[:, :, ::subsample[0], return py_conv_valid_numpy(img, kern)[
::subsample[1]] :, :, ::subsample[0], ::subsample[1]]
elif mode == 'full': elif mode == 'full':
return py_conv_full_numpy(img, kern)[:, :, ::subsample[0], return py_conv_full_numpy(img, kern)[
::subsample[1]] :, :, ::subsample[0], ::subsample[1]]
else: else:
raise Exception("Can't execute this kernel.") raise Exception("Can't execute this kernel.")
...@@ -119,20 +115,20 @@ def py_conv_scipy(img, kern, mode, subsample): ...@@ -119,20 +115,20 @@ def py_conv_scipy(img, kern, mode, subsample):
assert img.shape[1] == kern.shape[1] assert img.shape[1] == kern.shape[1]
if mode == 'valid': if mode == 'valid':
outshp = (img.shape[0], kern.shape[0], outshp = (img.shape[0], kern.shape[0],
img.shape[2] - kern.shape[2] + 1, img.shape[2] - kern.shape[2] + 1,
img.shape[3] - kern.shape[3] + 1) img.shape[3] - kern.shape[3] + 1)
else: else:
outshp = (img.shape[0], kern.shape[0], outshp = (img.shape[0], kern.shape[0],
img.shape[2] + kern.shape[2] - 1, img.shape[2] + kern.shape[2] - 1,
img.shape[3] + kern.shape[3] - 1) img.shape[3] + kern.shape[3] - 1)
out = numpy.zeros(outshp, dtype='float32') out = numpy.zeros(outshp, dtype='float32')
for b in xrange(out.shape[0]): for b in xrange(out.shape[0]):
for k in xrange(out.shape[1]): for k in xrange(out.shape[1]):
for s in xrange(img.shape[1]): for s in xrange(img.shape[1]):
#convolve2d or correlate # convolve2d or correlate
out[b, k, :, :] += convolve2d(img[b, s, :, :], out[b, k, :, :] += convolve2d(img[b, s, :, :],
kern[k, s, :, :], kern[k, s, :, :],
mode) mode)
return out[:, :, ::subsample[0], ::subsample[1]] return out[:, :, ::subsample[0], ::subsample[1]]
...@@ -168,10 +164,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -168,10 +164,12 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2, npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
dtype='float32') dtype='float32')
else: else:
npy_img = theano._asarray(numpy.arange( npy_img = theano._asarray(
numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1 numpy.arange(numpy.prod(ishape)).reshape(ishape),
npy_kern = -(theano._asarray(numpy.arange( dtype='float32') + 1
numpy.prod(kshape)).reshape(kshape), dtype='float32') + 1) npy_kern = -(theano._asarray(
numpy.arange(numpy.prod(kshape)).reshape(kshape),
dtype='float32') + 1)
img = cuda_ndarray.CudaNdarray(npy_img) img = cuda_ndarray.CudaNdarray(npy_img)
kern = cuda_ndarray.CudaNdarray(npy_kern) kern = cuda_ndarray.CudaNdarray(npy_kern)
...@@ -239,7 +237,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1), ...@@ -239,7 +237,7 @@ def _params_allgood(ishape, kshape, mode, subsample=(1, 1), img_stride=(1, 1),
div = float('inf') div = float('inf')
print('%15s' % str(ishape), '%15s' % str(kshape), end=' ') print('%15s' % str(ishape), '%15s' % str(kshape), end=' ')
print('%12.5f %7.2f %7.2f %7.1f' % ( print('%12.5f %7.2f %7.2f %7.1f' % (
approx_fp, cpu_mflops, gpu_mflops, div)) approx_fp, cpu_mflops, gpu_mflops, div))
def exec_conv(version, shapes, verbose, random, mode, def exec_conv(version, shapes, verbose, random, mode,
...@@ -261,7 +259,7 @@ def get_basic_shapes(): ...@@ -261,7 +259,7 @@ def get_basic_shapes():
return [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)), return [((1, 1, 1, 1), (1, 1, 1, 1), (1, 1), (1, 1), (1, 1)),
((1, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)), ((1, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)), ((1, 1, 3, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
# basic test for unsquare kernel and image # basic test for unsquare kernel and image
((1, 1, 2, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)), ((1, 1, 2, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 3, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)), ((1, 1, 3, 4), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
((1, 1, 4, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)), ((1, 1, 4, 3), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
...@@ -281,17 +279,17 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1), ...@@ -281,17 +279,17 @@ def get_shapes(imshp=(1, 1), kshp=(1, 1), subsample=(1, 1),
((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride), ((3, 1) + imshp, (1, 1) + kshp, subsample, img_stride, kern_stride),
# nkern only # nkern only
((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride), ((1, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and nkern # batch and nkern
((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride), ((3, 1) + imshp, (2, 1) + kshp, subsample, img_stride, kern_stride),
#batch and stack # batch and stack
((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride), ((3, 2) + imshp, (1, 2) + kshp, subsample, img_stride, kern_stride),
#stack and nkern # stack and nkern
((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride), ((1, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack # batch, nkern and stack
((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride), ((2, 2) + imshp, (2, 2) + kshp, subsample, img_stride, kern_stride),
#batch, nkern and stack # batch, nkern and stack
((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride) ((3, 2) + imshp, (4, 2) + kshp, subsample, img_stride, kern_stride)
] ]
def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1), def get_shapes2(scales_img=(1, 1), scales_kern=(1, 1), subsample=(1, 1),
...@@ -344,39 +342,39 @@ def get_valid_shapes(): ...@@ -344,39 +342,39 @@ def get_valid_shapes():
# test subsample done in a separate fct # test subsample done in a separate fct
shapes += [ shapes += [
# other test # other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize,
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image,
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern,
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim,
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1 ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)), # layer 1 backprop to weights
, ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)) # added a test case that fail from test_nnet.py.test_conv_nnet2 ((60, 20, 28, 28), (10, 20, 5, 5), (1, 1), (2, 2), (1, 1)), # added a test case that fail from test_nnet.py.test_conv_nnet2
, ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)) # test precedent but reduced that triger the error ((10, 5, 28, 28), (10, 5, 5, 5), (1, 1), (2, 2), (1, 1)), # test precedent but reduced that triger the error
# Test more than maxThreadsDim0 # Test more than maxThreadsDim0
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1))
] ]
shapes += [ ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 1 layers shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 1 layers
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 2 layers ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
, ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
, ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 2 valid ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid # ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 1 layers ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 2 layers ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1 # ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)), # test_lenet_64 bprop 1
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2 # ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1)) # test_lenet_64 bprop 2
] ]
return shapes return shapes
...@@ -466,48 +464,47 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[], ...@@ -466,48 +464,47 @@ def _test_full(cls, mode=None, version=[-1], extra_shapes=[],
shapes += [ shapes += [
# other test # other test
((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((2, 1, 2, 2), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 2), (1, 1), (1, 1), (1, 1)),
, ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((1, 1, 4, 4), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 3), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 10, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)) ((4, 1, 20, 10), (1, 1, 2, 10), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ((3, 2, 8, 8), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize
, ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image ((3, 2, 8, 6), (4, 2, 4, 4), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image
, ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize, non-square image, non-square kern ((3, 2, 8, 6), (4, 2, 4, 3), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize, non-square image, non-square kern
, ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)) # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim ((3, 2, 8, 6), (4, 2, 4, 6), (1, 1), (1, 1), (1, 1)), # stack, nkern, bsize ,non-square image, non-square kern, kernsize==imgsize on one dim
, ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)) # a big one ((16, 5, 64, 64), (8, 5, 8, 8), (1, 1), (1, 1), (1, 1)), # a big one
, ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # MNIST LeNET layer 1 ((16, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # MNIST LeNET layer 1
, ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights ((20, 16, 32, 32), (1, 16, 28, 28), (1, 1), (1, 1), (1, 1)) # layer 1 backprop to weights
] ]
if test_bigger_kernels: if test_bigger_kernels:
# Shapes where the kernel is larger than the image in some dimension # Shapes where the kernel is larger than the image in some dimension
shapes += [ shapes += [
((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)) ((3, 1, 1, 1), (2, 1, 5, 3), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)) ((3, 2, 1, 1), (4, 2, 1, 1), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 2, 6), (1, 1), (1, 1), (1, 1)),
, ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)) ((3, 2, 4, 4), (4, 2, 8, 6), (1, 1), (1, 1), (1, 1)),
, ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1)) ((4, 2, 10, 10), (3, 2, 2, 12), (1, 1), (1, 1), (1, 1))
] ]
shapes += [ shapes += [((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 1 layers
# ((60,1,28,28),(20,1,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 1 layers # ((60, 20, 12, 12),(30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 2 layers
# , ((60,20,12,12),(30,20,5,5), (1, 1), (1, 1), (1, 1))#test_lenet_28 2 layers ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 1 full
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # test_lenet_28 bprop 1 full # ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((20,60,12,12),(30,60,8,8), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid # ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1)), # test_lenet_28 bprop 2 valid
# , ((1,60,28,28),(20,60,24,24), (1, 1), (1, 1), (1, 1))#test_lenet_28 bprop 2 valid # ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 1 layers
# , ((10,1,64,64),(20,1,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 1 layers # ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 2 layers
# , ((10,20,29,29),(30,20,7,7), (1, 1), (1, 1), (1, 1))#test_lenet_64 2 layers ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # test_lenet_64 full
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # test_lenet_64 full # ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1)), # test_lenet_64 bprop 1
# , ((20,10,29,29),(30,10,23,23), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 1 # ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1)), # test_lenet_64 bprop 2
# , ((1,10,64,64),(20,10,58,58), (1, 1), (1, 1), (1, 1))#test_lenet_64 bprop 2 # Test more than maxThreadsDim0
# Test more than maxThreadsDim0 ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
, ((2, 4, 13, 1050), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)),
, ((2, 4, 1050, 13), (3, 4, 10, 11), (1, 1), (1, 1), (1, 1)) ((1, 1, 44800, 1), (6, 1, 1, 1), (1, 1), (1, 1), (1, 1)) # This caused crash
, ((1, 1, 44800, 1), (6, 1, 1, 1), (1, 1), (1, 1), (1, 1)) # This caused crash ]
]
verbose = 0 verbose = 0
random = True random = True
...@@ -561,7 +558,7 @@ def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]): ...@@ -561,7 +558,7 @@ def _test_subsample(cls, mode, version_valid=[-1], version_full=[-1]):
((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)), ((4, 2, 10, 10), (3, 2, 2, 2), (1, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)), ((4, 2, 10, 10), (3, 2, 2, 2), (3, 3), (1, 1), (1, 1)),
((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1)) ((4, 2, 10, 10), (3, 2, 2, 2), (3, 1), (1, 1), (1, 1))
] ]
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 1))
shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2)) shapes += get_shapes2(scales_img=(2, 2), subsample=(1, 2))
shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1)) shapes += get_shapes2(scales_img=(2, 2), subsample=(2, 1))
...@@ -636,7 +633,6 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -636,7 +633,6 @@ class TestConv2DGPU(unittest.TestCase):
imshp_logical=featshp_logical[1:], imshp_logical=featshp_logical[1:],
kshp_logical=kshp[2:]) kshp_logical=kshp[2:])
def test_invalid_input_shape(self): def test_invalid_input_shape(self):
""" """
Tests that when the shape gived at build time is not the same as Tests that when the shape gived at build time is not the same as
...@@ -659,7 +655,7 @@ class TestConv2DGPU(unittest.TestCase): ...@@ -659,7 +655,7 @@ class TestConv2DGPU(unittest.TestCase):
for mode in ['valid', 'full']: for mode in ['valid', 'full']:
for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)), for shapes in [((3, 2, 8, 8), (4, 2, 5, 5), (8, 8)),
((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)), ((3, 2, 8, 8), (4, 2, 5, 5), (5, 8)),
#((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)), # ((3, 2, 8, 8), (4, 2, 5, 5), (8, 5)),
# We use only the number of columns. # We use only the number of columns.
]: ]:
...@@ -700,11 +696,11 @@ class TestConvWithPadding(object): ...@@ -700,11 +696,11 @@ class TestConvWithPadding(object):
kern = theano._asarray(numpy.empty((1, 1, 1, 1)), dtype='float32') kern = theano._asarray(numpy.empty((1, 1, 1, 1)), dtype='float32')
for i in self.conv_ops: for i in self.conv_ops:
assert_raises(ValueError, i, img, kern, assert_raises(ValueError, i, img, kern,
border_mode=(-1, 0)) border_mode=(-1, 0))
assert_raises(ValueError, i, img, kern, assert_raises(ValueError, i, img, kern,
border_mode=(0, -1)) border_mode=(0, -1))
assert_raises(ValueError, i, img, kern, assert_raises(ValueError, i, img, kern,
border_mode='not border') border_mode='not border')
def _run_onecase(self, img_shape, kern_shape, padding, op): def _run_onecase(self, img_shape, kern_shape, padding, op):
npy_img = numpy.random.rand(*img_shape).astype('float32') npy_img = numpy.random.rand(*img_shape).astype('float32')
...@@ -776,9 +772,9 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy, ...@@ -776,9 +772,9 @@ def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
border_mode='valid', subsample=subsample)(i, k) border_mode='valid', subsample=subsample)(i, k)
f = theano.function([i, k], op, mode=theano_mode) f = theano.function([i, k], op, mode=theano_mode)
gpuval = numpy.array(f( gpuval = numpy.array(f(
npy_img.transpose(1, 0, 2, 3), npy_img.transpose(1, 0, 2, 3),
npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])).transpose( npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])
1, 0, 2, 3) ).transpose(1, 0, 2, 3)
assert_allclose(cpuval, gpuval, rtol=1e-4) assert_allclose(cpuval, gpuval, rtol=1e-4)
...@@ -892,44 +888,44 @@ def benchmark(): ...@@ -892,44 +888,44 @@ def benchmark():
shapes_valid = [ shapes_valid = [
# test_lenet_28 shape # test_lenet_28 shape
((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)) # valid ((20, 60, 12, 12), (30, 60, 8, 8), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 20, 12, 12), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 1, 28, 28), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)) # valid ((1, 60, 28, 28), (20, 60, 24, 24), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_32 shape # test_lenet_32 shape
, ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)) # valid ((20, 60, 14, 14), (30, 60, 10, 10), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 20, 14, 14), (30, 20, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)) # valid ((60, 1, 32, 32), (20, 1, 5, 5), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)) # valid ((1, 60, 32, 32), (20, 60, 28, 28), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_64 shape # test_lenet_64 shape
, ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 20, 29, 29), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)) # valid ((20, 10, 29, 29), (30, 10, 23, 23), (1, 1), (1, 1), (1, 1)), # valid
, ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 1, 64, 64), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)) # valid ((1, 10, 64, 64), (20, 10, 58, 58), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_108 shape # test_lenet_108 shape
, ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 20, 51, 51), (30, 20, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)) # valid ((20, 10, 51, 51), (30, 10, 45, 45), (1, 1), (1, 1), (1, 1)), # valid
, ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)) # valid ((10, 1, 108, 108), (20, 1, 7, 7), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)) # valid ((1, 10, 108, 108), (20, 10, 102, 102), (1, 1), (1, 1), (1, 1)), # valid
# test_lenet_256 shape # test_lenet_256 shape
, ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)) # valid ((2, 20, 124, 124), (30, 20, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
, ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)) # valid ((20, 2, 124, 124), (30, 2, 116, 116), (1, 1), (1, 1), (1, 1)), # valid
, ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)) # valid ((2, 1, 256, 256), (20, 1, 9, 9), (1, 1), (1, 1), (1, 1)), # valid
, ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)) # valid ((1, 2, 256, 256), (20, 2, 248, 248), (1, 1), (1, 1), (1, 1)) # valid
] ]
shapes_full = [ shapes_full = [
# test_lenet_28 shape # test_lenet_28 shape
((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full ((60, 30, 8, 8), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full
# test_lenet_32 shape # test_lenet_32 shape
, ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=1 ((60, 30, 10, 10), (20, 30, 5, 5), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=1
# test_lenet_64 shape # test_lenet_64 shape
, ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full conv_full_patch_stack_padded' N=3 ((10, 30, 23, 23), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full conv_full_patch_stack_padded' N=3
# test_lenet_108 shape # test_lenet_108 shape
, ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)) # full 'conv_full_patch_stack_padded' N=9 ((10, 30, 45, 45), (20, 30, 7, 7), (1, 1), (1, 1), (1, 1)), # full 'conv_full_patch_stack_padded' N=9
# test_lenet_256 shape # test_lenet_256 shape
, ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)) # full conv_reference_full ((2, 30, 116, 116), (20, 30, 9, 9), (1, 1), (1, 1), (1, 1)) # full conv_reference_full
] ]
version = [-1] version = [-1]
verbose = 1 verbose = 1
...@@ -952,6 +948,6 @@ def test_stack_rows_segfault_070312(): ...@@ -952,6 +948,6 @@ def test_stack_rows_segfault_070312():
kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32')) kern = theano.shared(numpy.random.rand(1, 80, 9, 9).astype('float32'))
out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32')) out = theano.shared(numpy.random.rand(1, 2, 2, 3).astype('float32'))
op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9), op = theano.tensor.nnet.conv.ConvOp(imshp=(80, 96, 96), kshp=(9, 9),
nkern=1, bsize=1) nkern=1, bsize=1)
f = theano.function([], [], updates=[(out, op(img, kern))], mode=theano_mode) f = theano.function([], [], updates=[(out, op(img, kern))], mode=theano_mode)
f() f()
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import time, copy, sys, unittest import copy
import unittest
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
...@@ -32,7 +33,7 @@ def advantage(cpu_dt, gpu_dt): ...@@ -32,7 +33,7 @@ def advantage(cpu_dt, gpu_dt):
def test_host_to_device(): def test_host_to_device():
#print >>sys.stdout, 'starting test_host_to_dev' # print >>sys.stdout, 'starting test_host_to_dev'
for shape in ((), (3,), (2, 3), (3, 4, 5, 6)): for shape in ((), (3,), (2, 3), (3, 4, 5, 6)):
a = theano._asarray(numpy.random.rand(*shape), dtype='float32') a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
b = cuda_ndarray.CudaNdarray(a) b = cuda_ndarray.CudaNdarray(a)
...@@ -52,30 +53,29 @@ def test_host_to_device(): ...@@ -52,30 +53,29 @@ def test_host_to_device():
def test_add_iadd_idiv(): def test_add_iadd_idiv():
for shapes in ( for shapes in ([(5, 5), (5, 1)],
[(5, 5), (5, 1)], [(5, 5), (1, 5)],
[(5, 5), (1, 5)], (), (0,), (3,), (2, 3),
(), (0,), (3,), (2, 3), (1, 10000000), (10000, 1000), (1000000, 10),
(1, 10000000), (10000, 1000), (1000000, 10), (4100, 33, 34), (33, 4100, 34), (33, 34, 4100),
(4100, 33, 34), (33, 4100, 34), (33, 34, 4100), (4100, 33, 3, 6), (33, 4100, 3, 6), (33, 3, 4100, 6), (33, 3, 6, 4100),
(4100, 33, 3, 6), (33, 4100, 3, 6), (33, 3, 4100, 6), (33, 3, 6, 4100), (4100, 3, 34, 6), (3, 4100, 34, 6), (3, 34, 4100, 6), (3, 34, 6, 4100),
(4100, 3, 34, 6), (3, 4100, 34, 6), (3, 34, 4100, 6), (3, 34, 6, 4100), (4100, 3, 4, 36), (3, 4100, 4, 36), (3, 4, 4100, 36), (3, 4, 36, 4100),
(4100, 3, 4, 36), (3, 4100, 4, 36), (3, 4, 4100, 36), (3, 4, 36, 4100), (0, 0, 0, 0, 0),
(0, 0, 0, 0, 0), (3, 34, 35, 36, 37),
(3, 34, 35, 36, 37), (33, 34, 3, 36, 37),
(33, 34, 3, 36, 37), (33, 34, 35, 36, 3),
(33, 34, 35, 36, 3), (0, 0, 0, 0, 0, 0),
(0, 0, 0, 0, 0, 0), (3, 34, 35, 36, 37, 2),
(3, 34, 35, 36, 37, 2), (33, 34, 3, 36, 37, 2),
(33, 34, 3, 36, 37, 2), (33, 34, 35, 36, 3, 2),
(33, 34, 35, 36, 3, 2), (3, 4, 5, 6, 7, 1025),
(3, 4, 5, 6, 7, 1025), (3, 4, 5, 6, 1025, 7),
(3, 4, 5, 6, 1025, 7), (3, 4, 5, 1025, 6, 7),
(3, 4, 5, 1025, 6, 7), (3, 4, 1025, 5, 6, 7),
(3, 4, 1025, 5, 6, 7), (3, 1025, 4, 5, 6, 7),
(3, 1025, 4, 5, 6, 7), (1025, 3, 4, 5, 6, 7),
(1025, 3, 4, 5, 6, 7), ):
):
if isinstance(shapes, tuple): if isinstance(shapes, tuple):
shape = shapes shape = shapes
shape2 = shapes shape2 = shapes
...@@ -98,18 +98,12 @@ def test_add_iadd_idiv(): ...@@ -98,18 +98,12 @@ def test_add_iadd_idiv():
# add don't support stride # add don't support stride
if shape == shape2: if shape == shape2:
t0 = time.time()
bsum = b0 + b1 bsum = b0 + b1
bsum = b0 + b1 bsum = b0 + b1
t1 = time.time()
gpu_dt = t1 - t0
t0 = time.time()
asum = a0 + a1 asum = a0 + a1
asum = a0 + a1 asum = a0 + a1
t1 = time.time()
cpu_dt = t1 - t0
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt) # print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
assert numpy.allclose(asum, numpy.asarray(bsum)) assert numpy.allclose(asum, numpy.asarray(bsum))
# test not contiguous version. # test not contiguous version.
# should raise not implemented. # should raise not implemented.
...@@ -133,23 +127,9 @@ def test_add_iadd_idiv(): ...@@ -133,23 +127,9 @@ def test_add_iadd_idiv():
raise Exception("You need to modify this case!") raise Exception("You need to modify this case!")
# TODO: b0[...,::-1] don't work # TODO: b0[...,::-1] don't work
if shape == shape2:
t = False
try:
_c = _b+b1
except TypeError:
t = True
assert t
# test inplace version # test inplace version
t0 = time.time()
b0 += b1 b0 += b1
t1 = time.time()
gpu_dt = t1 - t0
t0 = time.time()
a0 += a1 a0 += a1
t1 = time.time()
cpu_dt = t1 - t0
# print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt) # print shape, 'adding inplace', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
assert numpy.allclose(a0, numpy.asarray(b0)) assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0, a0_orig + a1) assert numpy.allclose(a0, a0_orig + a1)
...@@ -157,14 +137,14 @@ def test_add_iadd_idiv(): ...@@ -157,14 +137,14 @@ def test_add_iadd_idiv():
b0 /= b1 b0 /= b1
a0 /= a1 a0 /= a1
assert numpy.allclose(a0, numpy.asarray(b0)) assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0, (a0_orig + a1)/a1) assert numpy.allclose(a0, (a0_orig + a1) / a1)
# test inplace version # test inplace version
# for not contiguous input # for not contiguous input
b0 += _b b0 += _b
a0 += a1[..., ::-1] a0 += a1[..., ::-1]
assert numpy.allclose(a0, numpy.asarray(b0)) assert numpy.allclose(a0, numpy.asarray(b0))
assert numpy.allclose(a0, (a0_orig+a1)/a1+a1[..., ::-1]) assert numpy.allclose(a0, (a0_orig + a1) / a1 + a1[..., ::-1])
b0 /= _b b0 /= _b
a0 /= a1[..., ::-1] a0 /= a1[..., ::-1]
...@@ -174,48 +154,42 @@ def test_add_iadd_idiv(): ...@@ -174,48 +154,42 @@ def test_add_iadd_idiv():
def test_exp(): def test_exp():
#print >>sys.stdout, 'starting test_exp' # print >>sys.stdout, 'starting test_exp'
for shape in ((), (3,), (2, 3), for shape in ((), (3,), (2, 3),
(1, 10000000), (10, 1000000), (1, 10000000), (10, 1000000),
(100, 100000), (1000, 10000), (10000, 1000)): (100, 100000), (1000, 10000), (10000, 1000)):
a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32') a0 = theano._asarray(numpy.random.rand(*shape), dtype='float32')
a1 = a0.copy() a1 = a0.copy()
b0 = cuda_ndarray.CudaNdarray(a0) b0 = cuda_ndarray.CudaNdarray(a0)
b1 = cuda_ndarray.CudaNdarray(a1) cuda_ndarray.CudaNdarray(a1)
t0 = time.time()
bsum = b0.exp() bsum = b0.exp()
t1 = time.time()
gpu_dt = t1 - t0
t0 = time.time()
asum = numpy.exp(a1) asum = numpy.exp(a1)
t1 = time.time()
cpu_dt = t1 - t0
# print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt) # print shape, 'adding ', a0.size, 'cpu', cpu_dt, 'advantage', advantage(cpu_dt, gpu_dt)
#c = numpy.asarray(b0+b1) # c = numpy.asarray(b0+b1)
if asum.shape: if asum.shape:
assert numpy.allclose(asum, numpy.asarray(bsum)) assert numpy.allclose(asum, numpy.asarray(bsum))
def test_copy(): def test_copy():
#print >>sys.stdout, 'starting test_copy' # print >>sys.stdout, 'starting test_copy'
shape = (500, 499) shape = (500, 499)
a = theano._asarray(numpy.random.rand(*shape), dtype='float32') a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
#print >>sys.stdout, '.. creating device object' # print >>sys.stdout, '.. creating device object'
b = cuda_ndarray.CudaNdarray(a) b = cuda_ndarray.CudaNdarray(a)
#print >>sys.stdout, '.. copy' # print >>sys.stdout, '.. copy'
c = copy.copy(b) c = copy.copy(b)
#print >>sys.stdout, '.. deepcopy' # print >>sys.stdout, '.. deepcopy'
d = copy.deepcopy(b) d = copy.deepcopy(b)
#print >>sys.stdout, '.. comparisons' # print >>sys.stdout, '.. comparisons'
assert numpy.allclose(a, numpy.asarray(b)) assert numpy.allclose(a, numpy.asarray(b))
assert numpy.allclose(a, numpy.asarray(c)) assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d)) assert numpy.allclose(a, numpy.asarray(d))
b += b b += b
assert numpy.allclose(a+a, numpy.asarray(b)) assert numpy.allclose(a + a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c)) assert numpy.allclose(a + a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d)) assert numpy.allclose(a, numpy.asarray(d))
...@@ -237,8 +211,8 @@ def test_nvcc_bug(): ...@@ -237,8 +211,8 @@ def test_nvcc_bug():
assert numpy.allclose(a, numpy.asarray(c)) assert numpy.allclose(a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d)) assert numpy.allclose(a, numpy.asarray(d))
b += b b += b
assert numpy.allclose(a+a, numpy.asarray(b)) assert numpy.allclose(a + a, numpy.asarray(b))
assert numpy.allclose(a+a, numpy.asarray(c)) assert numpy.allclose(a + a, numpy.asarray(c))
assert numpy.allclose(a, numpy.asarray(d)) assert numpy.allclose(a, numpy.asarray(d))
...@@ -318,7 +292,7 @@ class test_DimShuffle(unittest.TestCase): ...@@ -318,7 +292,7 @@ class test_DimShuffle(unittest.TestCase):
def test_dot(): def test_dot():
#print >>sys.stdout, 'starting test_dot' # print >>sys.stdout, 'starting test_dot'
utt.seed_rng() utt.seed_rng()
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
...@@ -336,7 +310,7 @@ def test_dot(): ...@@ -336,7 +310,7 @@ def test_dot():
numpy_version = numpy.dot(a0, a1.T) numpy_version = numpy.dot(a0, a1.T)
transposed = cuda_ndarray.dimshuffle(b1, (1, 0)) transposed = cuda_ndarray.dimshuffle(b1, (1, 0))
cuda_version = cuda_ndarray.dot(b0, transposed) cuda_version = cuda_ndarray.dot(b0, transposed)
assert _allclose(numpy_version, cuda_version) assert _allclose(numpy_version, cuda_version)
...@@ -347,14 +321,16 @@ def test_dot(): ...@@ -347,14 +321,16 @@ def test_dot():
b0 = cuda_ndarray.CudaNdarray(a0) b0 = cuda_ndarray.CudaNdarray(a0)
assert _allclose(numpy.dot(a0.T, a1), assert _allclose(numpy.dot(a0.T, a1),
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)), b1)) cuda_ndarray.dot(
cuda_ndarray.dimshuffle(b0, (1, 0)), b1))
a1 = theano._asarray(rng.randn(6, 7), dtype='float32') a1 = theano._asarray(rng.randn(6, 7), dtype='float32')
b1 = cuda_ndarray.CudaNdarray(a1) b1 = cuda_ndarray.CudaNdarray(a1)
assert _allclose(numpy.dot(a0.T, a1.T), assert _allclose(
cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)), numpy.dot(a0.T, a1.T),
cuda_ndarray.dimshuffle(b1, (1, 0)))) cuda_ndarray.dot(cuda_ndarray.dimshuffle(b0, (1, 0)),
cuda_ndarray.dimshuffle(b1, (1, 0))))
def test_sum(): def test_sum():
...@@ -367,8 +343,8 @@ def test_sum(): ...@@ -367,8 +343,8 @@ def test_sum():
assert numpy.allclose(a0.sum(), assert numpy.allclose(a0.sum(),
numpy.asarray(b0.reduce_sum([1, 1]))) numpy.asarray(b0.reduce_sum([1, 1])))
a0sum = a0.sum(axis=0) a0.sum(axis=0)
b0sum = b0.reduce_sum([1, 0]) b0.reduce_sum([1, 0])
# print 'asum\n',a0sum # print 'asum\n',a0sum
# print 'bsum\n',numpy.asarray(b0sum) # print 'bsum\n',numpy.asarray(b0sum)
...@@ -399,31 +375,30 @@ def test_sum(): ...@@ -399,31 +375,30 @@ def test_sum():
def test_reshape(): def test_reshape():
shapelist = [ shapelist = [((1, 2, 3), (1, 2, 3)),
((1, 2, 3), (1, 2, 3)), ((1,), (1,)),
((1,), (1,)), ((1, 2, 3), (3, 2, 1)),
((1, 2, 3), (3, 2, 1)), ((1, 2, 3), (6,)),
((1, 2, 3), (6,)), ((1, 2, 3, 2), (6, 2)),
((1, 2, 3, 2), (6, 2)), ((2, 3, 2), (6, 2)),
((2, 3, 2), (6, 2)), ((2, 3, 2), (12,))
((2, 3, 2), (12,)) ]
]
bad_shapelist = [ bad_shapelist = [
((1, 2, 3), (1, 2, 4)), ((1, 2, 3), (1, 2, 4)),
((1,), (2,)), ((1,), (2,)),
((1, 2, 3), (2, 2, 1)), ((1, 2, 3), (2, 2, 1)),
((1, 2, 3), (5,)), ((1, 2, 3), (5,)),
((1, 2, 3, 2), (6, 3)), ((1, 2, 3, 2), (6, 3)),
((2, 3, 2), (5, 2)), ((2, 3, 2), (5, 2)),
((2, 3, 2), (11,)) ((2, 3, 2), (11,))
] ]
utt.seed_rng() utt.seed_rng()
rng = numpy.random.RandomState(utt.fetch_seed()) rng = numpy.random.RandomState(utt.fetch_seed())
def subtest(shape_1, shape_2, rng): def subtest(shape_1, shape_2, rng):
#print >> sys.stdout, "INFO: shapes", shape_1, shape_2 # print >> sys.stdout, "INFO: shapes", shape_1, shape_2
a = theano._asarray(rng.randn(*shape_1), dtype='float32') a = theano._asarray(rng.randn(*shape_1), dtype='float32')
b = cuda_ndarray.CudaNdarray(a) b = cuda_ndarray.CudaNdarray(a)
...@@ -459,8 +434,8 @@ def test_reshape(): ...@@ -459,8 +434,8 @@ def test_reshape():
b = cuda_ndarray.CudaNdarray(a) b = cuda_ndarray.CudaNdarray(a)
try: try:
bb = b.reshape(shape_2) b.reshape(shape_2)
except Exception as ValueError: except Exception:
return return
assert False assert False
...@@ -477,13 +452,13 @@ def test_reshape(): ...@@ -477,13 +452,13 @@ def test_reshape():
def test_getshape(): def test_getshape():
shapelist = [ shapelist = [
((1, 2, 3), (1, 2, 3)), ((1, 2, 3), (1, 2, 3)),
((1,), (1,)), ((1,), (1,)),
((1, 2, 3), (3, 2, 1)), ((1, 2, 3), (3, 2, 1)),
((1, 2, 3), (6,)), ((1, 2, 3), (6,)),
((1, 2, 3, 2), (6, 2)), ((1, 2, 3, 2), (6, 2)),
((2, 3, 2), (6, 2)) ((2, 3, 2), (6, 2))
] ]
def subtest(shape): def subtest(shape):
a = theano._asarray(numpy.random.rand(*shape_1), dtype='float32') a = theano._asarray(numpy.random.rand(*shape_1), dtype='float32')
...@@ -509,7 +484,7 @@ def test_stride_manipulation(): ...@@ -509,7 +484,7 @@ def test_stride_manipulation():
b_strides = b._strides b_strides = b._strides
for i in xrange(len(b.shape)): for i in xrange(len(b.shape)):
offset += (b.shape[i]-1) * b_strides[i] offset += (b.shape[i] - 1) * b_strides[i]
v._set_stride(i, -b_strides[i]) v._set_stride(i, -b_strides[i])
v._dev_data += offset * sizeof_float v._dev_data += offset * sizeof_float
...@@ -699,8 +674,8 @@ def test_setitem_matrixvector1(): ...@@ -699,8 +674,8 @@ def test_setitem_matrixvector1():
assert numpy.allclose(a, numpy.asarray(_a)) assert numpy.allclose(a, numpy.asarray(_a))
# test direct transfert from numpy # test direct transfert from numpy
_a[:, 1] = b*100 _a[:, 1] = b * 100
a[:, 1] = b*100 a[:, 1] = b * 100
assert numpy.allclose(a, numpy.asarray(_a)) assert numpy.allclose(a, numpy.asarray(_a))
row = theano._asarray([777, 888, 999], dtype='float32') row = theano._asarray([777, 888, 999], dtype='float32')
...@@ -725,8 +700,8 @@ def test_setitem_matrix_tensor3(): ...@@ -725,8 +700,8 @@ def test_setitem_matrix_tensor3():
assert numpy.allclose(a, numpy.asarray(_a)) assert numpy.allclose(a, numpy.asarray(_a))
# test direct transfert from numpy # test direct transfert from numpy
_a[:, 1, 1] = b*100 _a[:, 1, 1] = b * 100
a[:, 1, 1] = b*100 a[:, 1, 1] = b * 100
assert numpy.allclose(a, numpy.asarray(_a)) assert numpy.allclose(a, numpy.asarray(_a))
row = theano._asarray([777, 888, 999], dtype='float32') row = theano._asarray([777, 888, 999], dtype='float32')
...@@ -752,7 +727,7 @@ def test_setitem_matrix_bad_shape(): ...@@ -752,7 +727,7 @@ def test_setitem_matrix_bad_shape():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:, 1, 1] = _b _a[:, 1, 1] = _b
assert False assert False
except ValueError as e: except ValueError:
# print e # print e
assert True assert True
...@@ -761,7 +736,7 @@ def test_setitem_matrix_bad_shape(): ...@@ -761,7 +736,7 @@ def test_setitem_matrix_bad_shape():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[1, 1, :] = b _a[1, 1, :] = b
assert False assert False
except ValueError as e: except ValueError:
# print e # print e
assert True assert True
...@@ -779,7 +754,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -779,7 +754,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:, :, 1] = _b _a[:, :, 1] = _b
assert False assert False
except ValueError as e: except ValueError:
# print e # print e
assert True assert True
...@@ -788,7 +763,7 @@ def test_setitem_matrix_bad_ndim(): ...@@ -788,7 +763,7 @@ def test_setitem_matrix_bad_ndim():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[1, :, :] = b _a[1, :, :] = b
assert False assert False
except ValueError as e: except ValueError:
# print e # print e
assert True assert True
...@@ -806,7 +781,7 @@ def test_setitem_matrix_bad_type(): ...@@ -806,7 +781,7 @@ def test_setitem_matrix_bad_type():
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[1, :, :] = b _a[1, :, :] = b
assert False assert False
except TypeError as e: except TypeError:
# print e # print e
assert True assert True
...@@ -832,8 +807,8 @@ def test_setitem_assign_to_slice(): ...@@ -832,8 +807,8 @@ def test_setitem_assign_to_slice():
# test direct transfert from numpy # test direct transfert from numpy
_d = _a[1, :, :] _d = _a[1, :, :]
_d[1, :] = b*10 _d[1, :] = b * 10
a[1, :, :][1, :] = b*10 a[1, :, :][1, :] = b * 10
assert numpy.allclose(a, numpy.asarray(_a)) assert numpy.allclose(a, numpy.asarray(_a))
...@@ -923,7 +898,7 @@ def test_setitem_rightvalue_ndarray_fails(): ...@@ -923,7 +898,7 @@ def test_setitem_rightvalue_ndarray_fails():
b = theano._asarray([7, 8, 9, 10], dtype='float32') b = theano._asarray([7, 8, 9, 10], dtype='float32')
_b = cuda_ndarray.CudaNdarray(b) _b = cuda_ndarray.CudaNdarray(b)
b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32') b5 = theano._asarray([7, 8, 9, 10, 11], dtype='float32')
_b5 = cuda_ndarray.CudaNdarray(b) cuda_ndarray.CudaNdarray(b)
# attempt to assign the ndarray b with setitem # attempt to assign the ndarray b with setitem
_a[:, :, 1] = _b _a[:, :, 1] = _b
...@@ -941,9 +916,9 @@ def test_setitem_rightvalue_ndarray_fails(): ...@@ -941,9 +916,9 @@ def test_setitem_rightvalue_ndarray_fails():
# without same number of dim # without same number of dim
try: try:
_a[0, :, :] = mat _a[0, :, :] = mat
#a[0, :, :] = mat # a[0, :, :] = mat
#assert numpy.allclose(numpy.asarray(_a), a) # assert numpy.allclose(numpy.asarray(_a), a)
except ValueError as e: except ValueError:
pass pass
# test direct transfert from numpy with broadcast # test direct transfert from numpy with broadcast
...@@ -964,7 +939,7 @@ def test_zeros_basic(): ...@@ -964,7 +939,7 @@ def test_zeros_basic():
_n = numpy.zeros(shp, dtype="float32") _n = numpy.zeros(shp, dtype="float32")
assert numpy.allclose(numpy.asarray(_a), _n) assert numpy.allclose(numpy.asarray(_a), _n)
assert _a.shape == _n.shape assert _a.shape == _n.shape
assert all(_a._strides == numpy.asarray(_n.strides)/4) assert all(_a._strides == numpy.asarray(_n.strides) / 4)
# TODO:The following don't have the same stride! # TODO:The following don't have the same stride!
# This should be fixed with the new GpuNdArray. # This should be fixed with the new GpuNdArray.
...@@ -1039,10 +1014,7 @@ def test_is_c_contiguous(): ...@@ -1039,10 +1014,7 @@ def test_is_c_contiguous():
assert not a[::2].is_c_contiguous() assert not a[::2].is_c_contiguous()
if __name__ == '__main__': if __name__ == '__main__':
test_zeros_basic_3d_tensor()
test_zeros_basic_vector()
test_setitem_matrixvector1() test_setitem_matrixvector1()
test_setitem_matrix_tensor3() test_setitem_matrix_tensor3()
test_setitem_broadcast_must_fail()
test_setitem_assign_to_slice() test_setitem_assign_to_slice()
test_setitem_rightvalue_ndarray_fails() test_setitem_rightvalue_ndarray_fails()
...@@ -6,7 +6,7 @@ import theano ...@@ -6,7 +6,7 @@ import theano
try: try:
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
except ImportError: except ImportError:
# To have the GPU back-end work without nose, we need this file to # To have the GPU back-end work without nose, we need this file to
...@@ -33,8 +33,9 @@ def test_nvidia_driver1(): ...@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 2
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1: if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' + msg = '\n\t'.join(
'but got:']+[str(app) for app in topo]) ['Expected exactly one occurrence of GpuCAReduce ' +
'but got:'] + [str(app) for app in topo])
raise AssertionError(msg) raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()): if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS " raise Exception("The nvidia driver version installed with this OS "
......
...@@ -5,24 +5,22 @@ import itertools ...@@ -5,24 +5,22 @@ import itertools
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy as np import numpy as np
from six.moves import xrange from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False: if cuda_ndarray.cuda_available:
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
else:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu mode = mode_with_gpu
...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i])) utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32") a = np.random.random((block_max_size + 2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum # Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,), a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32") dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a_shape = [5, 5] a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks # Use multiple GPU gridblocks
a_shape = [4, 4] a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1 a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5) utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum # Use recursive cumsum
a_shape = [3, 3] a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self): def test_GpuCumsum3D(self):
...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis) # Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2] a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis) # Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis) # Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3] a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype(
"float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self): def test_GpuCumsum4D(self):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import numpy import numpy
import copy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
from theano.sandbox.cuda.blas import ( from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs) GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous from theano.sandbox.cuda.basic_ops import gpu_contiguous
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase): ...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs = shared(inputs_val) inputs = shared(inputs_val)
filters = shared(filters_val) filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[4]).astype('float32')) bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample, conv = theano.tensor.nnet.convTransp3D(W=filters,
b=bias,
d=subsample,
H=inputs) H=inputs)
f_ref = theano.function([], conv) f_ref = theano.function([], conv)
res_ref = f_ref() res_ref = f_ref()
......
...@@ -8,7 +8,7 @@ from theano.sandbox import cuda ...@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
......
...@@ -11,7 +11,7 @@ from theano import ifelse ...@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0): ...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated() theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
return ("(n malloc/theano mem allocated in KB)", return ("(n malloc/theano mem allocated in KB)",
n_mallocs + extra_alloc, n_mallocs + extra_alloc,
int(theano_alloc / 1024) + extra_size) int(theano_alloc / 1024))
return ("n malloc on the gpu", n_mallocs + extra_alloc) return ("n malloc on the gpu", n_mallocs + extra_alloc)
# I don't use the following by default as if there is other stuff running # I don't use the following by default as if there is other stuff running
...@@ -83,9 +83,12 @@ def test_memory(): ...@@ -83,9 +83,12 @@ def test_memory():
variables = cuda.shared_constructor(np.ones((shapes[1],), variables = cuda.shared_constructor(np.ones((shapes[1],),
dtype='float32')) dtype='float32'))
derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables)) derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
borrow=True, np.prod(variables.get_value(
return_internal_type=True).shape) * 4 / 1024, "kB") borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
...@@ -112,7 +115,7 @@ def test_memory(): ...@@ -112,7 +115,7 @@ def test_memory():
del obj del obj
# print "After deleting function 1", freemem() # print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem()) # assert mem2 == freemem(), (mem2, freemem())
del grad del grad
print("After deleting function 2", freemem()) print("After deleting function 2", freemem())
...@@ -155,16 +158,19 @@ def test_memory_lazy(): ...@@ -155,16 +158,19 @@ def test_memory_lazy():
derp = ifelse.IfElse(1)(branch_select, derp = ifelse.IfElse(1)(branch_select,
derp, some_matrix[:shapes[0]].sum()) derp, some_matrix[:shapes[0]].sum())
derp += 1 derp += 1
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
borrow=True, np.prod(variables.get_value(
return_internal_type=True).shape) * 4 / 1024, "kB") borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
mem2_1 = freemem(extra_alloc=more_alloc1) mem2_1 = freemem(extra_alloc=more_alloc1)
obj = theano.function([some_vector, branch_select], derp, obj = theano.function([some_vector, branch_select], derp,
mode=mode_with_gpu) mode=mode_with_gpu)
#theano.printing.debugprint(obj, print_type=True) # theano.printing.debugprint(obj, print_type=True)
mem3 = freemem() mem3 = freemem()
print("After function compilation 1", mem3) print("After function compilation 1", mem3)
assert mem2_1 == mem3, (mem2_1, mem3) assert mem2_1 == mem3, (mem2_1, mem3)
......
...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']: ...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!') 'otherwise it is too slow!')
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
if tcn.cuda_available == False: if tcn.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -68,7 +68,7 @@ def print_mode(mode): ...@@ -68,7 +68,7 @@ def print_mode(mode):
def print_diff_mode(a, b): def print_diff_mode(a, b):
if (a is not None and if (a is not None and
isinstance(a, (theano.compile.ProfileMode,)) and isinstance(a, (theano.compile.ProfileMode,)) and
isinstance(b, (theano.compile.ProfileMode,))): isinstance(b, (theano.compile.ProfileMode,))):
a.print_diff_summary(b) a.print_diff_summary(b)
...@@ -138,8 +138,8 @@ def test_run_nnet(): ...@@ -138,8 +138,8 @@ def test_run_nnet():
# print "cpu:", rval_cpu # print "cpu:", rval_cpu
# print "gpu:", rval_gpu # print "gpu:", rval_gpu
abs_diff, rel_diff = \ abs_diff, rel_diff = \
theano.gradient.numeric_grad.abs_rel_err(rval_gpu, theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
rval_cpu) rval_cpu)
max_abs_diff = abs_diff.max() max_abs_diff = abs_diff.max()
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % ( # print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# max_abs_diff, rel_diff.max(), n_in, n_hid) # max_abs_diff, rel_diff.max(), n_in, n_hid)
...@@ -147,19 +147,20 @@ def test_run_nnet(): ...@@ -147,19 +147,20 @@ def test_run_nnet():
rtol = 1e-4 rtol = 1e-4
if n_in * n_hid >= 2048 * 4096: if n_in * n_hid >= 2048 * 4096:
rtol = 7e-4 rtol = 7e-4
assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \ assert numpy.allclose(
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff, rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
rel_diff.max(), n_in, n_hid) ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid)
def test_run_nnet_med(): def test_run_nnet_med():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000) run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small(): def test_run_nnet_small():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000) run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu): def run_conv_nnet1(use_gpu):
...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu): ...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2, conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1) logical_hid_shape[1] // 2),
shape_kern1[2:],
n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))) hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle(( hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if downsample_ops: if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))) hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else: else:
hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x') hid = tensor.tanh(
))[:, :, ::2, ::2]) (conv_op(x, w0) + b0.dimshuffle(
(0, 'x', 'x')))[:, :, ::2, ::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x'))) hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
hid_flat = hid1.reshape((n_batch, n_hid)) hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c) out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
tensor.argmax(y, axis=1)) * lr) out, tensor.argmax(y, axis=1)) * lr)
# print 'loss type', loss.type # print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c] params = [w0, b0, w1, b1, v, c]
...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode = get_mode(use_gpu, check_isfinite) mode = get_mode(use_gpu, check_isfinite)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
if verbose: if verbose:
theano.printing.debugprint(train) theano.printing.debugprint(train)
...@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize, ...@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train, params, x_shape, y_shape, mode = build_conv_nnet2_classif( train, params, x_shape, y_shape, mode = build_conv_nnet2_classif(
use_gpu=use_gpu, use_gpu=use_gpu,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
if use_gpu: if use_gpu:
device = 'GPU' device = 'GPU'
...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize, ...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr = theano._asarray(0.01, dtype='float32') lr = theano._asarray(0.01, dtype='float32')
rvals = my_zeros(n_train) rvals = my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train): for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0] rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode) print_mode(mode)
if pickle and isinstance(mode, theano.compile.ProfileMode): if pickle and isinstance(mode, theano.compile.ProfileMode):
...@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare = True compare = True
if not compare: if not compare:
return run_conv_nnet2_classif(use_gpu=use_gpu, return run_conv_nnet2_classif(
seed=seed, isize=isize, ksize=ksize, bsize=bsize, use_gpu=use_gpu,
n_train=n_train, seed=seed, isize=isize, ksize=ksize, bsize=bsize,
check_isfinite=check_isfinite, n_train=n_train,
pickle=pickle, check_isfinite=check_isfinite,
verbose=verbose, pickle=pickle,
version=version) verbose=verbose,
version=version)
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \ train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
build_conv_nnet2_classif( build_conv_nnet2_classif(
use_gpu=False, use_gpu=False,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \ train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
build_conv_nnet2_classif( build_conv_nnet2_classif(
use_gpu=True, use_gpu=True,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
assert x_shape == x_shape_gpu assert x_shape == x_shape_gpu
assert y_shape == y_shape_gpu assert y_shape == y_shape_gpu
...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally: finally:
theano.tensor.basic.float32_atol = orig_float32_atol theano.tensor.basic.float32_atol = orig_float32_atol
if pickle:
if isinstance(cpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN CPU profile mode dump")
print(pickle.dumps(cpu_mode))
print("END CPU profile mode dump")
if isinstance(gpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN GPU profile mode dump")
print(pickle.dumps(gpu_mode))
print("END GPU profile mode dump")
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % ( # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu)) # (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % ( # print "Estimated time for one pass through MNIST with CPU: %f" % (
......
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import unittest
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import operator import operator
import sys import sys
import unittest
import numpy import numpy
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
...@@ -9,39 +8,28 @@ from nose.plugins.skip import SkipTest ...@@ -9,39 +8,28 @@ from nose.plugins.skip import SkipTest
from nose.tools import assert_raises from nose.tools import assert_raises
import theano import theano
import theano.sandbox.cuda.cula as cula
from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from six.moves import reduce from six.moves import reduce
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import config, tensor from theano import config, tensor
import theano.tensor.tests.test_nlinalg import theano.tensor.tests.test_nlinalg
import theano.tensor.tests.test_opt as test_opt import theano.tensor.tests.test_opt as test_opt
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv
from theano.sandbox.cuda.blocksparse import GpuSparseBlockOuter
from theano.tests.breakpoint import PdbBreakpoint from theano.tests.breakpoint import PdbBreakpoint
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
import theano.tests.test_ifelse
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
if not cuda.cuda_available: if not cuda.cuda_available:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.sandbox.cuda.cula as cula
from theano.sandbox.cuda import basic_ops
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.scalar.basic_scipy import erfinv
from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.sandbox.cuda.blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
imported_scipy_special = False
try:
import scipy.special
imported_scipy_special = True
# Importing scipy.special may raise ValueError.
# See http://projects.scipy.org/scipy/ticket/1739
except (ImportError, ValueError):
pass
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu') mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpu')
...@@ -136,7 +124,7 @@ def test_local_assert_no_cpu_op(): ...@@ -136,7 +124,7 @@ def test_local_assert_no_cpu_op():
config.on_opt_error = 'ignore' config.on_opt_error = 'ignore'
assert_raises(AssertionError, theano.function, assert_raises(AssertionError, theano.function,
[], out, mode=mode_local_assert) [], out, mode=mode_local_assert)
finally: finally:
config.assert_no_cpu_op = old config.assert_no_cpu_op = old
config.on_opt_error = old2 config.on_opt_error = old2
...@@ -152,7 +140,7 @@ def test_local_assert_no_cpu_op(): ...@@ -152,7 +140,7 @@ def test_local_assert_no_cpu_op():
def test_int_pow(): def test_int_pow():
a = CudaNdarrayType([False])() a = CudaNdarrayType([False])()
f = theano.function([a], (a*4).sum(), mode=mode_with_gpu) f = theano.function([a], (a * 4).sum(), mode=mode_with_gpu)
op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()] op_names = [n.op.__class__.__name__ for n in f.maker.fgraph.toposort()]
assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu'] assert op_names == ['GpuCAReduce', 'GpuElemwise', 'HostFromGpu']
...@@ -175,23 +163,30 @@ def test_gpualloc(): ...@@ -175,23 +163,30 @@ def test_gpualloc():
x = theano.shared(numpy.ones(3, dtype='float32'), 'x') x = theano.shared(numpy.ones(3, dtype='float32'), 'x')
m = (x).dimshuffle(['x', 0]) m = (x).dimshuffle(['x', 0])
v = tensor.alloc(1., *m.shape) v = tensor.alloc(1., *m.shape)
f = theano.function([], v + x, f = theano.function([],
mode=mode_with_gpu.excluding("local_elemwise_alloc")) v + x,
mode=mode_with_gpu.excluding(
"local_elemwise_alloc"))
l = f.maker.fgraph.toposort() l = f.maker.fgraph.toposort()
assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for x in l]) assert numpy.any([isinstance(x.op, cuda.GpuAlloc) for y in l])
def test_gpuallocempty(): def test_gpuallocempty():
f_gpu = theano.function([], tensor.AllocEmpty('float32')(2,3), f_gpu = theano.function(
mode=mode_with_gpu) [],
tensor.AllocEmpty('float32')(2, 3),
mode=mode_with_gpu)
l_gpu = f_gpu.maker.fgraph.toposort() l_gpu = f_gpu.maker.fgraph.toposort()
assert numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_gpu]) assert numpy.any(
[isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_gpu])
f_cpu = theano.function([], tensor.AllocEmpty('int32')(2,3)) f_cpu = theano.function([], tensor.AllocEmpty('int32')(2, 3))
l_cpu = f_cpu.maker.fgraph.toposort() l_cpu = f_cpu.maker.fgraph.toposort()
assert not numpy.any([isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu]) assert not numpy.any(
[isinstance(x.op, basic_ops.GpuAllocEmpty) for x in l_cpu])
class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc): class Test_local_elemwise_alloc(test_opt.Test_local_elemwise_alloc):
dtype = 'float32' dtype = 'float32'
...@@ -269,7 +264,8 @@ def test_gpuspecifyshape(): ...@@ -269,7 +264,8 @@ def test_gpuspecifyshape():
f = theano.function([], updates=[(x, m * numpy.float32(2))], f = theano.function([], updates=[(x, m * numpy.float32(2))],
mode=mode_with_gpu) mode=mode_with_gpu)
l = f.maker.fgraph.toposort() l = f.maker.fgraph.toposort()
assert not numpy.any([isinstance(x.op, cuda.HostFromGpu) for x in l]) assert not numpy.any(
[isinstance(x.op, cuda.HostFromGpu) for y in l])
def test_softmax(): def test_softmax():
...@@ -430,7 +426,7 @@ def test_local_gpu_subtensor(): ...@@ -430,7 +426,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input # Test multiple use of the input
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix() t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1], mode=mode_with_gpu) f = theano.function([t], [t[3:4], t + 1], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
...@@ -438,7 +434,7 @@ def test_local_gpu_subtensor(): ...@@ -438,7 +434,7 @@ def test_local_gpu_subtensor():
# Test multiple use of the input + input as output # Test multiple use of the input + input as output
# We want the subtensor to be on the GPU to prevent multiple transfer. # We want the subtensor to be on the GPU to prevent multiple transfer.
t = tensor.fmatrix() t = tensor.fmatrix()
f = theano.function([t], [t[3:4], t+1, t], mode=mode_with_gpu) f = theano.function([t], [t[3:4], t + 1, t], mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert not any([type(node.op) is tensor.Subtensor for node in topo]) assert not any([type(node.op) is tensor.Subtensor for node in topo])
assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
...@@ -446,7 +442,7 @@ def test_local_gpu_subtensor(): ...@@ -446,7 +442,7 @@ def test_local_gpu_subtensor():
# Test shared forced on CPU end we do computation on the output of # Test shared forced on CPU end we do computation on the output of
# the subtensor. # the subtensor.
t = tensor._shared(numpy.zeros(20, "float32")) t = tensor._shared(numpy.zeros(20, "float32"))
f = theano.function([], t[3:4]+1, mode=mode_with_gpu) f = theano.function([], t[3:4] + 1, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, cuda.GpuSubtensor) for node in topo])
...@@ -507,10 +503,11 @@ def test_local_gpu_split(): ...@@ -507,10 +503,11 @@ def test_local_gpu_split():
def test_print_op(): def test_print_op():
""" Test that print ops don't block gpu optimization""" """ Test that print ops don't block gpu optimization"""
b = tensor.fmatrix() b = tensor.fmatrix()
f = theano.function([b], theano.printing.Print()(b)*2, mode=mode_with_gpu) f = theano.function(
[b], theano.printing.Print()(b) * 2, mode=mode_with_gpu)
# theano.printing.debugprint(f) # theano.printing.debugprint(f)
# print f.maker.fgraph.toposort() # print f.maker.fgraph.toposort()
#[GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)] # [GpuFromHost(<TensorType(float32, matrix)>), <theano.printing.Print object at 0x3581210>(GpuFromHost.0), GpuElemwise{mul}(CudaNdarray{[[ 2.]]}, <theano.printing.Print object at 0x3581210>.0), HostFromGpu(GpuElemwise{mul}.0)]
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert topo[0].op == cuda.gpu_from_host assert topo[0].op == cuda.gpu_from_host
assert isinstance(topo[1].op, theano.printing.Print) assert isinstance(topo[1].op, theano.printing.Print)
...@@ -563,8 +560,10 @@ def test_huge_elemwise_fusion(): ...@@ -563,8 +560,10 @@ def test_huge_elemwise_fusion():
bytes limits. bytes limits.
""" """
shape = (2, 3, 4, 5, 6) shape = (2, 3, 4, 5, 6)
ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape)) ttype = tensor.tensor(dtype='float32',
gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()['gpu_ptr_size'] broadcastable=(False,) * len(shape))
gpu_ptr_size = theano.sandbox.cuda.opt.get_device_type_sizes()[
'gpu_ptr_size']
if gpu_ptr_size == 8: if gpu_ptr_size == 8:
nb_in = 7 nb_in = 7
len_topo = 10 len_topo = 10
...@@ -582,14 +581,19 @@ def test_huge_elemwise_fusion(): ...@@ -582,14 +581,19 @@ def test_huge_elemwise_fusion():
assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub) assert isinstance(topo[-3].op.scalar_op, theano.scalar.basic.Sub)
assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite) assert isinstance(topo[-2].op.scalar_op, theano.scalar.basic.Composite)
# let debugmode catch errors # let debugmode catch errors
gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32') # gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
def gen():
return(
theano._asarray(numpy.random.rand(*shape), dtype='float32'))
f(*[gen() for i in range(nb_in)]) f(*[gen() for i in range(nb_in)])
# Test the case where we can't put the computation on the gpu! their is too # Test the case where we can't put the computation on the gpu! their is too
# many dimensions to the input to have 2 inputs to the op! # many dimensions to the input to have 2 inputs to the op!
shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,) shape = (1, 2, 3, 4, 5, 6, 7, 2, 2, 3, 2, 1, 2, 2, 2,)
ttype = tensor.tensor(dtype='float32', broadcastable=(False,) * len(shape)) ttype = tensor.tensor(
dtype='float32', broadcastable=(False,) * len(shape))
vars = [tensor.tanh(ttype) for x in range(7)] vars = [tensor.tanh(ttype) for x in range(7)]
f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] - f = pfunc(vars, [vars[0] - vars[1] - vars[2] - vars[3] - vars[4] -
vars[5] - vars[6]], mode=mode_with_gpu) vars[5] - vars[6]], mode=mode_with_gpu)
...@@ -598,7 +602,9 @@ def test_huge_elemwise_fusion(): ...@@ -598,7 +602,9 @@ def test_huge_elemwise_fusion():
assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0 assert sum([isinstance(node.op, cuda.GpuElemwise) for node in topo]) == 0
assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1 assert sum([isinstance(node.op, tensor.Elemwise) for node in topo]) == 1
# let debugmode catch errors # let debugmode catch errors
gen = lambda: theano._asarray(numpy.random.rand(*shape), dtype='float32')
def gen():
return(theano._asarray(numpy.random.rand(*shape), dtype='float32'))
f(gen(), gen(), gen(), gen(), gen(), gen(), gen()) f(gen(), gen(), gen(), gen(), gen(), gen(), gen())
def gen(shape): def gen(shape):
...@@ -611,9 +617,9 @@ def test_huge_elemwise_fusion(): ...@@ -611,9 +617,9 @@ def test_huge_elemwise_fusion():
(2, 2, 2, 2), (2, 2, 2, 2),
(2, 2, 2, 2, 2), # 5d (2, 2, 2, 2, 2), # 5d
(2, 2, 2, 2, 2, 2), (2, 2, 2, 2, 2, 2),
# (2, 2, 2, 2, 2, 2, 2), # (2, 2, 2, 2, 2, 2, 2),
# (2, 2, 2, 2, 2, 2, 2, 2), # (2, 2, 2, 2, 2, 2, 2, 2),
# (2, 2, 2, 1, 1, 1, 1, 2, 2), # 9d # (2, 2, 2, 1, 1, 1, 1, 2, 2), # 9d
]: ]:
vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)] vals = [cuda.shared_constructor(gen(shape)) for x in range(max_var)]
for use_tan in [True, False]: for use_tan in [True, False]:
...@@ -676,7 +682,9 @@ def test_local_gpu_elemwise_0(): ...@@ -676,7 +682,9 @@ def test_local_gpu_elemwise_0():
a = tensor.fmatrix() a = tensor.fmatrix()
from theano.scalar.basic import identity from theano.scalar.basic import identity
out_s = theano.scalar.Composite([a_s, b_s, c_s], out_s = theano.scalar.Composite([a_s, b_s, c_s],
[identity(a_s), identity(c_s), identity(b_s)]) [identity(a_s),
identity(c_s),
identity(b_s)])
outs_op = tensor.Elemwise(out_s) outs_op = tensor.Elemwise(out_s)
f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
...@@ -725,9 +733,6 @@ def test_elemwise_fusion(): ...@@ -725,9 +733,6 @@ def test_elemwise_fusion():
theano._asarray(numpy.random.rand(*shape), dtype='float32')) theano._asarray(numpy.random.rand(*shape), dtype='float32'))
import theano.tests.test_ifelse
class TestIfElse(theano.tests.test_ifelse.test_ifelse): class TestIfElse(theano.tests.test_ifelse.test_ifelse):
dtype = "float32" dtype = "float32"
mode = mode_with_gpu mode = mode_with_gpu
...@@ -765,15 +770,17 @@ def test_incsubtensor_mixed(): ...@@ -765,15 +770,17 @@ def test_incsubtensor_mixed():
def test_erfinvgpu(): def test_erfinvgpu():
""" Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """ """ Test that local_gpu_elemwise_0 replaces Erfinv with ErfinvGPU """
x = tensor.fmatrix() x = tensor.fmatrix()
f = theano.function([x], tensor.Elemwise(erfinv)(x), mode=mode_with_gpu) f = theano.function([x],
f2 = theano.function([x], tensor.Elemwise(erfinv)(x), tensor.Elemwise(erfinv)(x),
mode=mode_without_gpu) mode=mode_with_gpu)
assert isinstance(f.maker.fgraph.toposort()[1].op, cuda.GpuElemwise) theano.function([x],
tensor.Elemwise(erfinv)(x),
mode=mode_without_gpu)
assert isinstance(f.maker.fgraph.toposort()[1].op,
cuda.GpuElemwise)
assert isinstance(f.maker.fgraph.toposort()[1].op.scalar_op, assert isinstance(f.maker.fgraph.toposort()[1].op.scalar_op,
cuda.elemwise.ErfinvGPU) cuda.elemwise.ErfinvGPU)
xv = numpy.random.rand(7, 8).astype('float32') numpy.random.rand(7, 8).astype('float32')
if imported_scipy_special:
assert numpy.allclose(f(xv), f2(xv))
def test_local_gpu_solve(): def test_local_gpu_solve():
...@@ -887,10 +894,10 @@ def test_local_abstractconv_gemm(): ...@@ -887,10 +894,10 @@ def test_local_abstractconv_gemm():
image = tensor.ftensor4() image = tensor.ftensor4()
W = tensor.ftensor4() W = tensor.ftensor4()
conv = tensor.nnet.conv2d(image, conv = tensor.nnet.conv2d(image,
W, W,
input_shape=(1, 32, 32, 32), input_shape=(1, 32, 32, 32),
filter_shape=(32, 32, 3, 3), filter_shape=(32, 32, 3, 3),
border_mode='half') border_mode='half')
f = theano.function([image, W], [conv], mode=mode_with_gpu) f = theano.function([image, W], [conv], mode=mode_with_gpu)
f(numpy.random.rand(1, 32, 32, 32).astype('float32'), f(numpy.random.rand(1, 32, 32, 32).astype('float32'),
numpy.random.rand(32, 32, 3, 3).astype('float32')) numpy.random.rand(32, 32, 3, 3).astype('float32'))
......
...@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams ...@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
# The PyCObject that represents the cuda random stream object # The PyCObject that represents the cuda random stream object
...@@ -168,13 +168,13 @@ def compare_speed(): ...@@ -168,13 +168,13 @@ def compare_speed():
dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX)) dest = theano.shared(numpy.zeros(N, dtype=theano.config.floatX))
mrg_u = theano.function([], [], updates={dest: mrg.uniform((N,))}, mrg_u = theano.function([], [], updates={dest: mrg.uniform((N,))},
profile='mrg uniform') profile='mrg uniform')
crn_u = theano.function([], [], updates={dest: crn.uniform((N,))}, crn_u = theano.function([], [], updates={dest: crn.uniform((N,))},
profile='crn uniform') profile='crn uniform')
mrg_n = theano.function([], [], updates={dest: mrg.normal((N,))}, mrg_n = theano.function([], [], updates={dest: mrg.normal((N,))},
profile='mrg normal') profile='mrg normal')
crn_n = theano.function([], [], updates={dest: crn.normal((N,))}, crn_n = theano.function([], [], updates={dest: crn.normal((N,))},
profile='crn normal') profile='crn normal')
for f in mrg_u, crn_u, mrg_n, crn_n: for f in mrg_u, crn_u, mrg_n, crn_n:
# don't time the first call, it has some startup cost # don't time the first call, it has some startup cost
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray. This file test tensor op that should also operate on CudaNdaray.
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy import numpy
...@@ -14,7 +13,7 @@ import theano.tensor as T ...@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
from theano.tensor.nnet.tests import test_conv3d2d from theano.tensor.nnet.tests import test_conv3d2d
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -57,7 +56,7 @@ def test_softmax_optimizations(): ...@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n = tensor.lvector('one_of_n') one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot op = crossentropy_categorical_1hot
xe = op(x, one_of_n) op(x, one_of_n)
fgraph = theano.gof.FunctionGraph( fgraph = theano.gof.FunctionGraph(
[x, one_of_n], [x, one_of_n],
...@@ -84,10 +83,10 @@ def test_may_share_memory_cuda(): ...@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented # can't test the transpose as ta._strides = is not implemented
# manual transpose of a # manual transpose of a
#ta = a.reshape((4,3)) # ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented # ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize # elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size # ta.gpudata += ta.size*elem_size
for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False), for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
(a, na, False), (b, nb, False), (a, na, False), (b, nb, False),
...@@ -95,8 +94,7 @@ def test_may_share_memory_cuda(): ...@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(a, va, True), (b, vb, True), (a, va, True), (b, vb, True),
(va, b, False), (a, vb, False), (va, b, False), (a, vb, False),
(a, ra, True), (b, rb, True), (a, ra, True), (b, rb, True),
(ra, b, False), (a, rb, False), (ra, b, False), (a, rb, False), ]:
]:
assert may_share_memory(a_, b_) == rep assert may_share_memory(a_, b_) == rep
assert may_share_memory(b_, a_) == rep assert may_share_memory(b_, a_) == rep
......
...@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc ...@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from theano.sandbox.cuda import CudaNdarrayType, cuda_available from theano.sandbox.cuda import CudaNdarrayType, cuda_available
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
if cuda_available == False: if cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -26,20 +26,19 @@ def test_float32_shared_constructor(): ...@@ -26,20 +26,19 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they # test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples # don't strictly have to be tuples
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=(True, False)).type, broadcastable=(True, False)).type,
CudaNdarrayType((True, False))) CudaNdarrayType((True, False)))
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=[True, False]).type, broadcastable=[True, False]).type,
CudaNdarrayType((True, False))) CudaNdarrayType((True, False)))
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=numpy.array([True, False])).type, broadcastable=numpy.array([True, False])).type,
CudaNdarrayType([True, False])) CudaNdarrayType([True, False]))
# test that we can make non-matrix shared vars # test that we can make non-matrix shared vars
assert eq( assert eq(f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type, CudaNdarrayType((False,) * 4))
CudaNdarrayType((False,) * 4))
def test_givens(): def test_givens():
...@@ -72,13 +71,14 @@ class T_updates(unittest.TestCase): ...@@ -72,13 +71,14 @@ class T_updates(unittest.TestCase):
# This test case uses code mentionned in #698 # This test case uses code mentionned in #698
data = numpy.random.rand(10, 10).astype('float32') data = numpy.random.rand(10, 10).astype('float32')
output_var = f32sc(name="output", output_var = f32sc(name="output",
value=numpy.zeros((10, 10), 'float32')) value=numpy.zeros((10, 10), 'float32'))
x = tensor.fmatrix('x') x = tensor.fmatrix('x')
output_updates = [(output_var, x ** 2)] output_updates = [(output_var, x ** 2)]
output_givens = {x: data} output_givens = {x: data}
output_func = theano.function(inputs=[], outputs=[], output_func = theano.function(
updates=output_updates, givens=output_givens) inputs=[], outputs=[],
updates=output_updates, givens=output_givens)
output_func() output_func()
def test_err_ndim(self): def test_err_ndim(self):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy import numpy
import unittest
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano import theano
...@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') ...@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def test_viewop_gpu(): def test_viewop_gpu():
from theano.sandbox import cuda from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
_x = theano.tensor.fvector('x') _x = theano.tensor.fvector('x')
x = cuda.gpu_from_host(_x) x = cuda.gpu_from_host(_x)
...@@ -19,6 +18,6 @@ def test_viewop_gpu(): ...@@ -19,6 +18,6 @@ def test_viewop_gpu():
out = cuda.host_from_gpu(_out) out = cuda.host_from_gpu(_out)
f = theano.function([x], f = theano.function([x],
out, out,
mode=mode_with_gpu) mode=mode_with_gpu)
data = numpy.array([1, 2, 3], dtype='float32') data = numpy.array([1, 2, 3], dtype='float32')
assert numpy.allclose(f(data), data) assert numpy.allclose(f(data), data)
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from __future__ import print_function from __future__ import print_function
import sys, time import sys
import time
from six import iteritems from six import iteritems
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -35,35 +36,47 @@ def showtimes(times): ...@@ -35,35 +36,47 @@ def showtimes(times):
def cmp_sigmoids(shape): def cmp_sigmoids(shape):
def numpy_sigmoid(input): def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input)) 1.0 / (1.0 + numpy.exp(-input))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))() sinput = tensor.Tensor(
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input') dtype='float32', broadcastable=(0,) * len(shape))()
times = compare_fns( shared_input = tcn.shared_constructor(
dict( numpy=numpy_sigmoid numpy.random.rand(*shape),
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))) 'shared_input')
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))]) times = compare_fns(dict(
), numpy=numpy_sigmoid,
input=shared_input.value) theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input)))])),
input=shared_input.value)
showtimes(times) showtimes(times)
def cmp_sigmoids_T(shape): def cmp_sigmoids_T(shape):
def numpy_sigmoid(input): def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input.T)) 1.0 / (1.0 + numpy.exp(-input.T))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))() sinput = tensor.Tensor(
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input') dtype='float32', broadcastable=(0,) * len(shape))()
times = compare_fns( shared_input = tcn.shared_constructor(
dict( numpy=numpy_sigmoid numpy.random.rand(*shape),
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))) 'shared_input')
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + times = compare_fns(dict(
tensor.exp(-shared_input.T)))]) numpy=numpy_sigmoid,
), theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))),
input=shared_input.value) theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input.T)))])),
input=shared_input.value)
showtimes(times) showtimes(times)
if __name__ == '__main__': if __name__ == '__main__':
eval(sys.argv[1]) eval(sys.argv[1])
# cmp_sigmoids((640, 64*64)) # looks great in profiler # cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49)) # cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49)) # cmp_sigmoids_T((173, 74*49))
...@@ -130,10 +130,10 @@ class CudaNdarrayType(Type): ...@@ -130,10 +130,10 @@ class CudaNdarrayType(Type):
type(data) is float and type(data) is float and
self.dtype == theano.config.floatX): self.dtype == theano.config.floatX):
return cuda.filter(converted_data, self.broadcastable, return cuda.filter(converted_data, self.broadcastable,
strict, old_data) strict, old_data)
elif numpy.all(data == converted_data): elif numpy.all(data == converted_data):
return cuda.filter(converted_data, self.broadcastable, return cuda.filter(converted_data, self.broadcastable,
strict, old_data) strict, old_data)
else: else:
raise TypeError( raise TypeError(
'%s, with dtype %s, cannot store accurately value %s, ' '%s, with dtype %s, cannot store accurately value %s, '
...@@ -259,8 +259,8 @@ class CudaNdarrayType(Type): ...@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64': (complex, 'theano_complex64', 'complex64': (complex, 'theano_complex64',
'NPY_COMPLEX64')}[self.dtype] 'NPY_COMPLEX64')}[self.dtype]
except KeyError: except KeyError:
raise TypeError("Unsupported dtype for %s: %s" % ( raise TypeError("Unsupported dtype for %s: %s" %
self.__class__.__name__, self.dtype)) (self.__class__.__name__, self.dtype))
def __eq__(self, other): def __eq__(self, other):
""" """
...@@ -271,10 +271,11 @@ class CudaNdarrayType(Type): ...@@ -271,10 +271,11 @@ class CudaNdarrayType(Type):
other.broadcastable == self.broadcastable) other.broadcastable == self.broadcastable)
def convert_variable(self, var): def convert_variable(self, var):
if (type(self) == type(var.type) and if (isinstance(self, type(var.type)) and
self.ndim == var.type.ndim and self.ndim == var.type.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable, all(sb == ob or ob for sb, ob in zip(
var.type.broadcastable))): self.broadcastable,
var.type.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
def __hash__(self): def __hash__(self):
...@@ -312,7 +313,7 @@ class CudaNdarrayType(Type): ...@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return self.name return self.name
else: else:
b = self.broadcastable b = self.broadcastable
#bcast = str(self.broadcastable) # bcast = str(self.broadcastable)
if not numpy.any(b): if not numpy.any(b):
s = "%iD" % len(b) s = "%iD" % len(b)
else: else:
...@@ -327,7 +328,7 @@ class CudaNdarrayType(Type): ...@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def __repr__(self): def __repr__(self):
return str(self) return str(self)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable)) # "CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def c_declare(self, name, sub, check_input=True): def c_declare(self, name, sub, check_input=True):
return """ CudaNdarray * %(name)s;""" % locals() return """ CudaNdarray * %(name)s;""" % locals()
...@@ -417,7 +418,7 @@ class CudaNdarrayType(Type): ...@@ -417,7 +418,7 @@ class CudaNdarrayType(Type):
return sio.getvalue() return sio.getvalue()
def c_extract_out(self, name, sub, check_input=True, check_broadcast=True): def c_extract_out(self, name, sub, check_input=True, check_broadcast=True):
""" """
To allow the hack to skip check_broadcast. To allow the hack to skip check_broadcast.
""" """
...@@ -528,13 +529,13 @@ theano.compile.ops.expandable_types += (CudaNdarrayType,) ...@@ -528,13 +529,13 @@ theano.compile.ops.expandable_types += (CudaNdarrayType,)
# Register C code for ViewOp on CudaNdarrayType # Register C code for ViewOp on CudaNdarrayType
theano.compile.register_view_op_c_code( theano.compile.register_view_op_c_code(
CudaNdarrayType, CudaNdarrayType,
""" """
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = %(iname)s; %(oname)s = %(iname)s;
Py_XINCREF(%(oname)s); Py_XINCREF(%(oname)s);
""", """,
version=1) version=1)
theano.compile.register_shape_i_c_code( theano.compile.register_shape_i_c_code(
CudaNdarrayType, CudaNdarrayType,
...@@ -555,16 +556,15 @@ theano.compile.register_shape_i_c_code( ...@@ -555,16 +556,15 @@ theano.compile.register_shape_i_c_code(
# Register CudaNdarrayType to the DeepCopyOp list of types with c code. # Register CudaNdarrayType to the DeepCopyOp list of types with c code.
theano.compile.register_deep_copy_op_c_code( theano.compile.register_deep_copy_op_c_code(
CudaNdarrayType, CudaNdarrayType,
""" """
int alloc = %(oname)s == NULL; int alloc = %(oname)s == NULL;
for(int i=0; !alloc && i<CudaNdarray_NDIM(%(oname)s); i++) { for(int i=0; !alloc && i<CudaNdarray_NDIM(%(oname)s); i++) {
if(CudaNdarray_HOST_DIMS(%(iname)s)[i] != if(CudaNdarray_HOST_DIMS(%(iname)s)[i] !=
CudaNdarray_HOST_DIMS(%(oname)s)[i]) { CudaNdarray_HOST_DIMS(%(oname)s)[i]) {
alloc = true; alloc = true;
break; break;
} }}
}
if(alloc) { if(alloc) {
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s); %(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s);
...@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code( ...@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)s; %(fail)s;
} }
} }
""", """, version=3)
version=3)
# THIS WORKS But CudaNdarray instances don't compare equal to one # THIS WORKS But CudaNdarray instances don't compare equal to one
...@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda): ...@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported. # In case cuda is not imported.
if cuda is not None: if cuda is not None:
copyreg.pickle(cuda.CudaNdarray, CudaNdarray_pickler, copyreg.pickle(
CudaNdarray_unpickler) cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
...@@ -13,7 +13,7 @@ try: ...@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc # We must do those import to be able to create the full doc when nvcc
# is not available # is not available
from theano.sandbox.cuda import filter as type_support_filter from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost from theano.sandbox.cuda.basic_ops import HostFromGpu
except ImportError: except ImportError:
pass pass
...@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators): ...@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def _as_TensorVariable(self): def _as_TensorVariable(self):
return HostFromGpu()(self) return HostFromGpu()(self)
def _as_CudaNdarrayVariable(self): def _as_CudaNdarrayVariable(self):
return self return self
...@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature): ...@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class CudaNdarrayConstant(_operators, Constant): class CudaNdarrayConstant(_operators, Constant):
def signature(self): def signature(self):
return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data))) return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data)))
def __str__(self): def __str__(self):
if self.name is not None: if self.name is not None:
return self.name return self.name
...@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant): ...@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data = str(numpy.asarray(self.data)) data = str(numpy.asarray(self.data))
except Exception as e: except Exception as e:
data = "error while transferring the value: " + str(e) data = "error while transferring the value: " + str(e)
return "CudaNdarrayConstant{"+data+"}" return "CudaNdarrayConstant{" + data + "}"
CudaNdarrayType.Constant = CudaNdarrayConstant CudaNdarrayType.Constant = CudaNdarrayConstant
......
...@@ -87,42 +87,8 @@ whitelist_flake8 = [ ...@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py", "sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py", "sandbox/tests/test_scan.py",
"sandbox/tests/__init__.py", "sandbox/tests/__init__.py",
"sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py",
"sandbox/cuda/basic_ops.py",
"sandbox/cuda/nnet.py",
"sandbox/cuda/elemwise.py",
"sandbox/cuda/type.py",
"sandbox/cuda/__init__.py", "sandbox/cuda/__init__.py",
"sandbox/cuda/opt.py",
"sandbox/cuda/blas.py",
"sandbox/cuda/blocksparse.py",
"sandbox/cuda/rng_curand.py",
"sandbox/cuda/fftconv.py",
"sandbox/cuda/kernel_codegen.py",
"sandbox/cuda/GpuConvTransp3D.py",
"sandbox/cuda/nvcc_compiler.py",
"sandbox/cuda/neighbours.py",
"sandbox/cuda/tests/__init__.py", "sandbox/cuda/tests/__init__.py",
"sandbox/cuda/tests/walltime.py",
"sandbox/cuda/tests/test_gradient.py",
"sandbox/cuda/tests/test_neighbours.py",
"sandbox/cuda/tests/test_conv_cuda_ndarray.py",
"sandbox/cuda/tests/test_var.py",
"sandbox/cuda/tests/test_opt.py",
"sandbox/cuda/tests/test_blas.py",
"sandbox/cuda/tests/test_driver.py",
"sandbox/cuda/tests/test_rng_curand.py",
"sandbox/cuda/tests/test_basic_ops.py",
"sandbox/cuda/tests/test_memory.py",
"sandbox/cuda/tests/test_mlp.py",
"sandbox/cuda/tests/test_bench_loopfusion.py",
"sandbox/cuda/tests/test_blocksparse.py",
"sandbox/cuda/tests/test_cuda_ndarray.py",
"sandbox/cuda/tests/test_tensor_op.py",
"sandbox/cuda/tests/test_extra_ops.py",
"sandbox/cuda/tests/test_gemmcorr3d.py",
"sandbox/cuda/tests/test_viewop.py",
"sandbox/gpuarray/tests/__init__.py", "sandbox/gpuarray/tests/__init__.py",
"sandbox/scan_module/scan_utils.py", "sandbox/scan_module/scan_utils.py",
"sandbox/scan_module/scan.py", "sandbox/scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论