提交 b69ad54d 作者: Xavier Bouthillier

Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py
...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp): ...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5 broad = (False,) * 5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs=[CudaNdarrayType(dtype=V_.dtype, outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()]) broadcastable=broad)()])
...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp): ...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2] outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3] outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4] outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d dr, dc, dt = d
dCdW = numpy.zeros(WShape, dtype=V.dtype) dCdW = numpy.zeros(WShape, dtype=V.dtype)
...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp): ...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for p in xrange(0, outputHeight): for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth): for q in xrange(0, outputWidth):
for r in xrange(0, outputDur): for r in xrange(0, outputDur):
dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m] dCdW[j, z, k, l, m] += dCdH[
i, j, p, q, r] * \
V[i, z, dr * p + k,
dc * q + l,
dt * r + m]
output_storage[0][0] = dCdW output_storage[0][0] = dCdW
......
...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp): ...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else: else:
RShape_ = T.as_tensor_variable([-1, -1, -1]) RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_], return theano.Apply(
outputs=[CudaNdarrayType(dtype=H_.dtype, self, inputs=[W_, b_, d_, H_, RShape_],
broadcastable=(False,)*5)()]) outputs=[CudaNdarrayType(
dtype=H_.dtype, broadcastable=(False,) * 5)()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs W, b, d, H, RShape = node.inputs
...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert dc > 0 assert dc > 0
assert dt > 0 assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight videoHeight = (outputHeight - 1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth videoWidth = (outputWidth - 1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur videoDur = (outputDur - 1) * dt + filterDur
if Rshape is not None and Rshape[0] != -1: if Rshape is not None and Rshape[0] != -1:
if Rshape[0] < videoHeight: if Rshape[0] < videoHeight:
...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else: # else:
# print "No Rshape passed in" # print "No Rshape passed in"
# print "video size: "+str((videoHeight, videoWidth, videoDur)) # print "video size: " + str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight, R = numpy.zeros((batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur),
dtype=H.dtype)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0, batchSize): for i in xrange(0, batchSize):
# print '\texample '+str(i+1)+'/'+str(batchSize) # print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0, inputChannels): for j in xrange(0, inputChannels):
# print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels) # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
for r in xrange(0, videoHeight): for r in xrange(0, videoHeight):
# print '\t\t\trow '+str(r+1)+'/'+str(videoHeight) # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
for c in xrange(0, videoWidth): for c in xrange(0, videoWidth):
for t in xrange(0, videoDur): for t in xrange(0, videoDur):
R[i, j, r, c, t] = b[j] R[i, j, r, c, t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur + 1 )/float(dt))) ]) ftc = max(
fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ]) [0,
int(numpy.ceil(
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ]) float(t - filterDur + 1) / float(dt)
))
]
)
fcc = max(
[0,
int(numpy.ceil(
float(c - filterWidth + 1) / float(dc)
))
]
)
rc = max(
[0,
int(numpy.ceil(
float(r - filterHeight + 1) / float(dr)
))
]
)
while rc < outputHeight: while rc < outputHeight:
rk = r - rc * dr rk = r - rc * dr
if rk < 0: if rk < 0:
...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if tk < 0: if tk < 0:
break break
R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] ) R[i, j, r, c, t] += numpy.dot(
W[:, j, rk, ck, tk],
H[i, :, rc, cc, tc])
tc += 1 tc += 1
"" # close loop over tc "" # close loop over tc
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy
import os import os
import logging import logging
_logger = logging.getLogger(__name__)
from six import integer_types from six import integer_types
from six.moves import StringIO, reduce from six.moves import StringIO, reduce
import theano import theano
from theano import Apply from theano import Apply
from theano import tensor from theano import tensor
...@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp ...@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable, from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous) gpu_contiguous)
from theano.tensor import as_tensor_variable from theano.tensor import as_tensor_variable
_logger = logging.getLogger(__name__)
class GpuBatchedDot(GpuOp): class GpuBatchedDot(GpuOp):
...@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp): ...@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
} }
} else { } else {
// copy inputs if not contiguous // copy inputs if not contiguous
""" + """ + ("\n".join("""
("\n".join("""
if (( CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1 if (( CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1 && CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1) && CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1)
...@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp): ...@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(%(var)s); Py_XDECREF(%(var)s);
%(var)s = _copy; %(var)s = _copy;
} }
""" % dict(var=var, fail=fail) for var in (bx, by))) """ % dict(var=var, fail=fail) for var in (bx, by))) + """
+ """
// fail if the output is not contiguous; we can't copy it because we // fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory // need to write to the original memory
...@@ -537,8 +532,8 @@ class GpuGemm(GpuOp): ...@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return 'GpuGemm{no_inplace}' return 'GpuGemm{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -562,7 +557,7 @@ class GpuGemm(GpuOp): ...@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return (4,) return (4,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in # inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
...@@ -657,8 +652,8 @@ class GpuGemv(GpuOp): ...@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return 'GpuGemv{no_inplace}' return 'GpuGemv{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -682,7 +677,7 @@ class GpuGemv(GpuOp): ...@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return (3,) return (3,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in # inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs z_in, a, x, y, b = inputs
...@@ -757,8 +752,8 @@ class GpuGer(GpuOp): ...@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return 'GpuGer{no_inplace}' return 'GpuGer{no_inplace}'
def __eq__(self, other): def __eq__(self, other):
return (type(self) == type(other)\ return (type(self) == type(other) and
and self.inplace == other.inplace) self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.inplace)
...@@ -782,7 +777,7 @@ class GpuGer(GpuOp): ...@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return (2,) return (2,)
def c_code(self, node, name, inputs, outputs, sub): def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in # z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in # inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out. # not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs z_in, a, x, y = inputs
...@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM): ...@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom, top = inp[:2] bottom, top = inp[:2]
weights, = grads weights, = grads
weights = gpu_contiguous(weights) weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)( d_bottom = GpuCorrMM_gradInputs(
weights, top, bottom.shape[-2:]) self.border_mode, self.subsample)(weights,
d_top = GpuCorrMM(self.border_mode, self.subsample)( top,
bottom, weights) bottom.shape[-2:])
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else () d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_bottom, d_top) + d_height_width return (d_bottom, d_top) + d_height_width
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM): ...@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights, top = inp[:2] weights, top = inp[:2]
bottom, = grads bottom, = grads
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)( d_weights = GpuCorrMM_gradWeights(
self.border_mode, self.subsample)(
bottom, top, weights.shape[-2:]) bottom, top, weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode, self.subsample)( d_top = GpuCorrMM(
bottom, weights) self.border_mode, self.subsample)(bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else () d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_weights, d_top) + d_height_width return (d_weights, d_top) + d_height_width
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM): ...@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom, weights = inp bottom, weights = inp
top, = grads top, = grads
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)( d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
weights, top, bottom.shape[-3:]) self.subsample,
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)( self.pad)(weights,
bottom, top, weights.shape[-3:]) top,
bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
return d_bottom, d_weights return d_bottom, d_weights
...@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM): ...@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights, top = inp[:2] weights, top = inp[:2]
bottom, = grads bottom, = grads
bottom = gpu_contiguous(bottom) bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)( d_weights = GpuCorr3dMM_gradWeights(
self.border_mode, self.subsample, self.pad)(
bottom, top, weights.shape[-3:]) bottom, top, weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)( d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.pad)(
bottom, weights) bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else () d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth return (d_weights, d_top) + d_height_width_depth
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp): ...@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
# def perform(self, node, input_storage, output_storage): # def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented') # raise NotImplementedError('only C is implemented')
def c_code_cache_version(self): def c_code_cache_version(self):
return (6) return (6)
......
...@@ -5,9 +5,9 @@ import numpy as np ...@@ -5,9 +5,9 @@ import numpy as np
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse from theano.ifelse import ifelse
from theano.misc.pycuda_init import pycuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType, from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how many values are expected. # how many values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v, output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
input_shape=input_fft_v_shape, input_shape=input_fft_v_shape,
filter_shape=filters_fft_v_shape) filter_shape=filters_fft_v_shape)
#output_fft_s = input_fft_v # output_fft_s = input_fft_v
# reshape for IFFT # reshape for IFFT
output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2)) output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how many values are expected. # how many values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1),
(f2 - 1):(f2 - 1 + i2 - f2 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1),
(f2 - 1):(f2 - 1 + i2 + f2 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
#output = output_circ[:, :, :, :, :] # output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the # Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT. # trip through FFT and inverse FFT.
......
...@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
return [ return [ # get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount), inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
'float row_max = ' + buf + '[0]', 'float row_max = ' + buf + '[0]',
'__syncthreads()', '__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
'; __i+=' + threadCount + '){', threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)', buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]', buf2 + '[__i] = ' + buf + '[__i]', '}',
'}',
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
...@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
# divide each exp() result by the sum to complete the job. # divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N + 'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){', '; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
'}',
'__syncthreads()', '__syncthreads()',
] ]
...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals()) init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" % loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos)) r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos)) r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
# This is work in progress # This is work in progress
from theano import Op, Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
......
...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp): ...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel(
"kSoftmax_%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'const int sx0',
body=[ 'const int sx1',
"extern __shared__ float buf[]", 'float * sm',
'const int sm_s0',
'const int sm_s1'],
body=["extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]", "}", "__syncthreads()",
"}", inline_softmax('N',
"__syncthreads()", 'buf',
inline_softmax('N', 'buf', 'buf2', 'buf2',
'threadIdx.x', 'blockDim.x'), 'threadIdx.x',
'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This sets all values correctly # This sets all values correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"}", "__syncthreads()", "}", ])
"__syncthreads()", ret2 = nvcc_kernel(
"}", "kSoftmax_fixed_shared%s" % nodename,
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=["extern __shared__ float buf[]",
"extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]", "const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]", "float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'), 'threadIdx.x',
"__syncthreads()", 'blockDim.x'),
"}", "__syncthreads()", "}", ])
])
return ret1 + "\n" + ret2 return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0', 'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=["extern __shared__ float buf[]",
"extern __shared__ float buf[]",
"float * buf2 = buf + N", "float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;" "for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){", " blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] += b[tx * sb0]", "buf[tx] += b[tx * sb0]",
"buf2[tx] = buf[tx]", "buf2[tx] = buf[tx]", "}",
"}", "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
"__syncthreads()", 'threadIdx.x',
inline_softmax('N', 'buf', 'buf2', 'blockDim.x'),
'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"}", "__syncthreads()", "}", ])
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const float * x',
...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]", "float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr',
'sm_s1',
'threadIdx.x', 'threadIdx.x',
'blockDim.x', 'blockDim.x',
'b', 'sb0'), 'b', 'sb0'),
......
...@@ -4,7 +4,6 @@ import logging ...@@ -4,7 +4,6 @@ import logging
import os import os
import subprocess import subprocess
import sys import sys
import warnings
from locale import getpreferredencoding from locale import getpreferredencoding
import numpy import numpy
...@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler): ...@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
_logger.debug('Writing module C++ code to %s', cppfilename) _logger.debug('Writing module C++ code to %s', cppfilename)
cppfile.write(src_code) cppfile.write(src_code)
lib_filename = os.path.join(location, '%s.%s' % lib_filename = os.path.join(
location, '%s.%s' %
(module_name, get_lib_extension())) (module_name, get_lib_extension()))
_logger.debug('Generating shared lib %s', lib_filename) _logger.debug('Generating shared lib %s', lib_filename)
...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler): ...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof = cmd.index('-u') indexof = cmd.index('-u')
cmd.pop(indexof) # Remove -u cmd.pop(indexof) # Remove -u
cmd.pop(indexof) # Remove argument to -u cmd.pop(indexof) # Remove argument to -u
except ValueError as e: except ValueError:
done = True done = True
# CUDA Toolkit v4.1 Known Issues: # CUDA Toolkit v4.1 Known Issues:
...@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler): ...@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
console_encoding = getpreferredencoding() console_encoding = getpreferredencoding()
nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding) nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding) nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
finally: finally:
os.chdir(orig_dir) os.chdir(orig_dir)
......
差异被折叠。
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
import numpy import numpy
import theano.gof import theano.gof
from theano.compat import PY3 from theano.compat import PY3
...@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt) ...@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, Variable from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config = theano.config config = theano.config
...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp): ...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op. Return a tuple of attributes that define the Op.
""" """
return ( return (self.destructive,
self.destructive,
self.output_type, self.output_type,
self.seed, self.seed,
) )
...@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp): ...@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
v_size = theano.tensor.as_tensor_variable(size) v_size = theano.tensor.as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
self = cls( self = cls(output_type=CudaNdarrayType((False,) * ndim),
output_type=CudaNdarrayType((False,) * ndim),
seed=seed, seed=seed,
destructive=False) destructive=False)
...@@ -386,5 +383,5 @@ def local_destructive(node): ...@@ -386,5 +383,5 @@ def local_destructive(node):
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('CURAND_destructive', optdb.register('CURAND_destructive',
opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run', opt.in2out(local_destructive, ignore_newtrees=True),
'inplace') 99, 'fast_run', 'inplace')
...@@ -6,7 +6,7 @@ import theano ...@@ -6,7 +6,7 @@ import theano
try: try:
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
except ImportError: except ImportError:
# To have the GPU back-end work without nose, we need this file to # To have the GPU back-end work without nose, we need this file to
...@@ -33,8 +33,9 @@ def test_nvidia_driver1(): ...@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 2
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1: if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' + msg = '\n\t'.join(
'but got:']+[str(app) for app in topo]) ['Expected exactly one occurrence of GpuCAReduce ' +
'but got:'] + [str(app) for app in topo])
raise AssertionError(msg) raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()): if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS " raise Exception("The nvidia driver version installed with this OS "
......
...@@ -5,24 +5,22 @@ import itertools ...@@ -5,24 +5,22 @@ import itertools
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy as np import numpy as np
from six.moves import xrange from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False: if cuda_ndarray.cuda_available:
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
else:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu mode = mode_with_gpu
...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i])) utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32") a = np.random.random((block_max_size + 2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum # Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,), a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32") dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a_shape = [5, 5] a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks # Use multiple GPU gridblocks
a_shape = [4, 4] a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1 a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5) utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum # Use recursive cumsum
a_shape = [3, 3] a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self): def test_GpuCumsum3D(self):
...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis) # Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2] a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis) # Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis) # Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3] a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype(
"float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self): def test_GpuCumsum4D(self):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import numpy import numpy
import copy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
from theano.sandbox.cuda.blas import ( from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs) GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous from theano.sandbox.cuda.basic_ops import gpu_contiguous
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase): ...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs = shared(inputs_val) inputs = shared(inputs_val)
filters = shared(filters_val) filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[4]).astype('float32')) bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample, conv = theano.tensor.nnet.convTransp3D(W=filters,
b=bias,
d=subsample,
H=inputs) H=inputs)
f_ref = theano.function([], conv) f_ref = theano.function([], conv)
res_ref = f_ref() res_ref = f_ref()
......
...@@ -8,7 +8,7 @@ from theano.sandbox import cuda ...@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
......
...@@ -11,7 +11,7 @@ from theano import ifelse ...@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0): ...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated() theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
return ("(n malloc/theano mem allocated in KB)", return ("(n malloc/theano mem allocated in KB)",
n_mallocs + extra_alloc, n_mallocs + extra_alloc,
int(theano_alloc / 1024) + extra_size) int(theano_alloc / 1024))
return ("n malloc on the gpu", n_mallocs + extra_alloc) return ("n malloc on the gpu", n_mallocs + extra_alloc)
# I don't use the following by default as if there is other stuff running # I don't use the following by default as if there is other stuff running
...@@ -83,9 +83,12 @@ def test_memory(): ...@@ -83,9 +83,12 @@ def test_memory():
variables = cuda.shared_constructor(np.ones((shapes[1],), variables = cuda.shared_constructor(np.ones((shapes[1],),
dtype='float32')) dtype='float32'))
derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables)) derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
np.prod(variables.get_value(
borrow=True, borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB") return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
...@@ -112,7 +115,7 @@ def test_memory(): ...@@ -112,7 +115,7 @@ def test_memory():
del obj del obj
# print "After deleting function 1", freemem() # print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem()) # assert mem2 == freemem(), (mem2, freemem())
del grad del grad
print("After deleting function 2", freemem()) print("After deleting function 2", freemem())
...@@ -155,16 +158,19 @@ def test_memory_lazy(): ...@@ -155,16 +158,19 @@ def test_memory_lazy():
derp = ifelse.IfElse(1)(branch_select, derp = ifelse.IfElse(1)(branch_select,
derp, some_matrix[:shapes[0]].sum()) derp, some_matrix[:shapes[0]].sum())
derp += 1 derp += 1
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
np.prod(variables.get_value(
borrow=True, borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB") return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
mem2_1 = freemem(extra_alloc=more_alloc1) mem2_1 = freemem(extra_alloc=more_alloc1)
obj = theano.function([some_vector, branch_select], derp, obj = theano.function([some_vector, branch_select], derp,
mode=mode_with_gpu) mode=mode_with_gpu)
#theano.printing.debugprint(obj, print_type=True) # theano.printing.debugprint(obj, print_type=True)
mem3 = freemem() mem3 = freemem()
print("After function compilation 1", mem3) print("After function compilation 1", mem3)
assert mem2_1 == mem3, (mem2_1, mem3) assert mem2_1 == mem3, (mem2_1, mem3)
......
...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']: ...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!') 'otherwise it is too slow!')
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
if tcn.cuda_available == False: if tcn.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -147,19 +147,20 @@ def test_run_nnet(): ...@@ -147,19 +147,20 @@ def test_run_nnet():
rtol = 1e-4 rtol = 1e-4
if n_in * n_hid >= 2048 * 4096: if n_in * n_hid >= 2048 * 4096:
rtol = 7e-4 rtol = 7e-4
assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \ assert numpy.allclose(
rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff, ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid) rel_diff.max(), n_in, n_hid)
def test_run_nnet_med(): def test_run_nnet_med():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000) run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small(): def test_run_nnet_small():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000) run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu): def run_conv_nnet1(use_gpu):
...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu): ...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2, conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1) logical_hid_shape[1] // 2),
shape_kern1[2:],
n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))) hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle(( hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if downsample_ops: if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))) hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else: else:
hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x') hid = tensor.tanh(
))[:, :, ::2, ::2]) (conv_op(x, w0) + b0.dimshuffle(
(0, 'x', 'x')))[:, :, ::2, ::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x'))) hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
hid_flat = hid1.reshape((n_batch, n_hid)) hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c) out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
tensor.argmax(y, axis=1)) * lr) out, tensor.argmax(y, axis=1)) * lr)
# print 'loss type', loss.type # print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c] params = [w0, b0, w1, b1, v, c]
...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode = get_mode(use_gpu, check_isfinite) mode = get_mode(use_gpu, check_isfinite)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
if verbose: if verbose:
theano.printing.debugprint(train) theano.printing.debugprint(train)
...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize, ...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr = theano._asarray(0.01, dtype='float32') lr = theano._asarray(0.01, dtype='float32')
rvals = my_zeros(n_train) rvals = my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train): for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0] rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode) print_mode(mode)
if pickle and isinstance(mode, theano.compile.ProfileMode): if pickle and isinstance(mode, theano.compile.ProfileMode):
...@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare = True compare = True
if not compare: if not compare:
return run_conv_nnet2_classif(use_gpu=use_gpu, return run_conv_nnet2_classif(
use_gpu=use_gpu,
seed=seed, isize=isize, ksize=ksize, bsize=bsize, seed=seed, isize=isize, ksize=ksize, bsize=bsize,
n_train=n_train, n_train=n_train,
check_isfinite=check_isfinite, check_isfinite=check_isfinite,
...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally: finally:
theano.tensor.basic.float32_atol = orig_float32_atol theano.tensor.basic.float32_atol = orig_float32_atol
if pickle:
if isinstance(cpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN CPU profile mode dump")
print(pickle.dumps(cpu_mode))
print("END CPU profile mode dump")
if isinstance(gpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN GPU profile mode dump")
print(pickle.dumps(gpu_mode))
print("END GPU profile mode dump")
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % ( # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu)) # (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % ( # print "Estimated time for one pass through MNIST with CPU: %f" % (
......
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import unittest
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
...@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams ...@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
# The PyCObject that represents the cuda random stream object # The PyCObject that represents the cuda random stream object
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray. This file test tensor op that should also operate on CudaNdaray.
""" """
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import copy
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy import numpy
...@@ -14,7 +13,7 @@ import theano.tensor as T ...@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
from theano.tensor.nnet.tests import test_conv3d2d from theano.tensor.nnet.tests import test_conv3d2d
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -57,7 +56,7 @@ def test_softmax_optimizations(): ...@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n = tensor.lvector('one_of_n') one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot op = crossentropy_categorical_1hot
xe = op(x, one_of_n) op(x, one_of_n)
fgraph = theano.gof.FunctionGraph( fgraph = theano.gof.FunctionGraph(
[x, one_of_n], [x, one_of_n],
...@@ -84,10 +83,10 @@ def test_may_share_memory_cuda(): ...@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented # can't test the transpose as ta._strides = is not implemented
# manual transpose of a # manual transpose of a
#ta = a.reshape((4,3)) # ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented # ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize # elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size # ta.gpudata += ta.size*elem_size
for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False), for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
(a, na, False), (b, nb, False), (a, na, False), (b, nb, False),
...@@ -95,8 +94,7 @@ def test_may_share_memory_cuda(): ...@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(a, va, True), (b, vb, True), (a, va, True), (b, vb, True),
(va, b, False), (a, vb, False), (va, b, False), (a, vb, False),
(a, ra, True), (b, rb, True), (a, ra, True), (b, rb, True),
(ra, b, False), (a, rb, False), (ra, b, False), (a, rb, False), ]:
]:
assert may_share_memory(a_, b_) == rep assert may_share_memory(a_, b_) == rep
assert may_share_memory(b_, a_) == rep assert may_share_memory(b_, a_) == rep
......
...@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc ...@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from theano.sandbox.cuda import CudaNdarrayType, cuda_available from theano.sandbox.cuda import CudaNdarrayType, cuda_available
import theano.sandbox.cuda as cuda import theano.sandbox.cuda as cuda
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
if cuda_available == False: if cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -26,19 +26,18 @@ def test_float32_shared_constructor(): ...@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they # test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples # don't strictly have to be tuples
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=(True, False)).type, broadcastable=(True, False)).type,
CudaNdarrayType((True, False))) CudaNdarrayType((True, False)))
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=[True, False]).type, broadcastable=[True, False]).type,
CudaNdarrayType((True, False))) CudaNdarrayType((True, False)))
assert eq( assert eq(f32sc(npy_row,
f32sc(npy_row, broadcastable=numpy.array([True, False])).type, broadcastable=numpy.array([True, False])).type,
CudaNdarrayType([True, False])) CudaNdarrayType([True, False]))
# test that we can make non-matrix shared vars # test that we can make non-matrix shared vars
assert eq( assert eq(f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
CudaNdarrayType((False,) * 4)) CudaNdarrayType((False,) * 4))
...@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase): ...@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
x = tensor.fmatrix('x') x = tensor.fmatrix('x')
output_updates = [(output_var, x ** 2)] output_updates = [(output_var, x ** 2)]
output_givens = {x: data} output_givens = {x: data}
output_func = theano.function(inputs=[], outputs=[], output_func = theano.function(
inputs=[], outputs=[],
updates=output_updates, givens=output_givens) updates=output_updates, givens=output_givens)
output_func() output_func()
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import numpy import numpy
import unittest
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano import theano
...@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') ...@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def test_viewop_gpu(): def test_viewop_gpu():
from theano.sandbox import cuda from theano.sandbox import cuda
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
_x = theano.tensor.fvector('x') _x = theano.tensor.fvector('x')
x = cuda.gpu_from_host(_x) x = cuda.gpu_from_host(_x)
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from __future__ import print_function from __future__ import print_function
import sys, time import sys
import time
from six import iteritems from six import iteritems
from theano.compile.pfunc import pfunc from theano.compile.pfunc import pfunc
from theano import tensor from theano import tensor
...@@ -35,35 +36,47 @@ def showtimes(times): ...@@ -35,35 +36,47 @@ def showtimes(times):
def cmp_sigmoids(shape): def cmp_sigmoids(shape):
def numpy_sigmoid(input): def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input)) 1.0 / (1.0 + numpy.exp(-input))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))() sinput = tensor.Tensor(
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input') dtype='float32', broadcastable=(0,) * len(shape))()
times = compare_fns( shared_input = tcn.shared_constructor(
dict( numpy=numpy_sigmoid numpy.random.rand(*shape),
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))) 'shared_input')
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))]) times = compare_fns(dict(
), numpy=numpy_sigmoid,
theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input)))])),
input=shared_input.value) input=shared_input.value)
showtimes(times) showtimes(times)
def cmp_sigmoids_T(shape): def cmp_sigmoids_T(shape):
def numpy_sigmoid(input): def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input.T)) 1.0 / (1.0 + numpy.exp(-input.T))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))() sinput = tensor.Tensor(
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input') dtype='float32', broadcastable=(0,) * len(shape))()
times = compare_fns( shared_input = tcn.shared_constructor(
dict( numpy=numpy_sigmoid numpy.random.rand(*shape),
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))) 'shared_input')
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + times = compare_fns(dict(
tensor.exp(-shared_input.T)))]) numpy=numpy_sigmoid,
), theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))),
theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input.T)))])),
input=shared_input.value) input=shared_input.value)
showtimes(times) showtimes(times)
if __name__ == '__main__': if __name__ == '__main__':
eval(sys.argv[1]) eval(sys.argv[1])
# cmp_sigmoids((640, 64*64)) # looks great in profiler # cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49)) # cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49)) # cmp_sigmoids_T((173, 74*49))
...@@ -259,8 +259,8 @@ class CudaNdarrayType(Type): ...@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64': (complex, 'theano_complex64', 'complex64': (complex, 'theano_complex64',
'NPY_COMPLEX64')}[self.dtype] 'NPY_COMPLEX64')}[self.dtype]
except KeyError: except KeyError:
raise TypeError("Unsupported dtype for %s: %s" % ( raise TypeError("Unsupported dtype for %s: %s" %
self.__class__.__name__, self.dtype)) (self.__class__.__name__, self.dtype))
def __eq__(self, other): def __eq__(self, other):
""" """
...@@ -271,9 +271,10 @@ class CudaNdarrayType(Type): ...@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
other.broadcastable == self.broadcastable) other.broadcastable == self.broadcastable)
def convert_variable(self, var): def convert_variable(self, var):
if (type(self) == type(var.type) and if (isinstance(self, type(var.type)) and
self.ndim == var.type.ndim and self.ndim == var.type.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable, all(sb == ob or ob for sb, ob in zip(
self.broadcastable,
var.type.broadcastable))): var.type.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable) return theano.tensor.patternbroadcast(var, self.broadcastable)
...@@ -312,7 +313,7 @@ class CudaNdarrayType(Type): ...@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return self.name return self.name
else: else:
b = self.broadcastable b = self.broadcastable
#bcast = str(self.broadcastable) # bcast = str(self.broadcastable)
if not numpy.any(b): if not numpy.any(b):
s = "%iD" % len(b) s = "%iD" % len(b)
else: else:
...@@ -327,7 +328,7 @@ class CudaNdarrayType(Type): ...@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def __repr__(self): def __repr__(self):
return str(self) return str(self)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable)) # "CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def c_declare(self, name, sub, check_input=True): def c_declare(self, name, sub, check_input=True):
return """ CudaNdarray * %(name)s;""" % locals() return """ CudaNdarray * %(name)s;""" % locals()
...@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code( ...@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
CudaNdarray_HOST_DIMS(%(oname)s)[i]) { CudaNdarray_HOST_DIMS(%(oname)s)[i]) {
alloc = true; alloc = true;
break; break;
} }}
}
if(alloc) { if(alloc) {
Py_XDECREF(%(oname)s); Py_XDECREF(%(oname)s);
%(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s); %(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s);
...@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code( ...@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)s; %(fail)s;
} }
} }
""", """, version=3)
version=3)
# THIS WORKS But CudaNdarray instances don't compare equal to one # THIS WORKS But CudaNdarray instances don't compare equal to one
...@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda): ...@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported. # In case cuda is not imported.
if cuda is not None: if cuda is not None:
copyreg.pickle(cuda.CudaNdarray, CudaNdarray_pickler, copyreg.pickle(
CudaNdarray_unpickler) cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
...@@ -13,7 +13,7 @@ try: ...@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc # We must do those import to be able to create the full doc when nvcc
# is not available # is not available
from theano.sandbox.cuda import filter as type_support_filter from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost from theano.sandbox.cuda.basic_ops import HostFromGpu
except ImportError: except ImportError:
pass pass
...@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators): ...@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def _as_TensorVariable(self): def _as_TensorVariable(self):
return HostFromGpu()(self) return HostFromGpu()(self)
def _as_CudaNdarrayVariable(self): def _as_CudaNdarrayVariable(self):
return self return self
...@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature): ...@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class CudaNdarrayConstant(_operators, Constant): class CudaNdarrayConstant(_operators, Constant):
def signature(self): def signature(self):
return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data))) return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data)))
def __str__(self): def __str__(self):
if self.name is not None: if self.name is not None:
return self.name return self.name
...@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant): ...@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data = str(numpy.asarray(self.data)) data = str(numpy.asarray(self.data))
except Exception as e: except Exception as e:
data = "error while transferring the value: " + str(e) data = "error while transferring the value: " + str(e)
return "CudaNdarrayConstant{"+data+"}" return "CudaNdarrayConstant{" + data + "}"
CudaNdarrayType.Constant = CudaNdarrayConstant CudaNdarrayType.Constant = CudaNdarrayConstant
......
...@@ -87,42 +87,8 @@ whitelist_flake8 = [ ...@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py", "sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py", "sandbox/tests/test_scan.py",
"sandbox/tests/__init__.py", "sandbox/tests/__init__.py",
"sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py",
"sandbox/cuda/basic_ops.py",
"sandbox/cuda/nnet.py",
"sandbox/cuda/elemwise.py",
"sandbox/cuda/type.py",
"sandbox/cuda/__init__.py", "sandbox/cuda/__init__.py",
"sandbox/cuda/opt.py",
"sandbox/cuda/blas.py",
"sandbox/cuda/blocksparse.py",
"sandbox/cuda/rng_curand.py",
"sandbox/cuda/fftconv.py",
"sandbox/cuda/kernel_codegen.py",
"sandbox/cuda/GpuConvTransp3D.py",
"sandbox/cuda/nvcc_compiler.py",
"sandbox/cuda/neighbours.py",
"sandbox/cuda/tests/__init__.py", "sandbox/cuda/tests/__init__.py",
"sandbox/cuda/tests/walltime.py",
"sandbox/cuda/tests/test_gradient.py",
"sandbox/cuda/tests/test_neighbours.py",
"sandbox/cuda/tests/test_conv_cuda_ndarray.py",
"sandbox/cuda/tests/test_var.py",
"sandbox/cuda/tests/test_opt.py",
"sandbox/cuda/tests/test_blas.py",
"sandbox/cuda/tests/test_driver.py",
"sandbox/cuda/tests/test_rng_curand.py",
"sandbox/cuda/tests/test_basic_ops.py",
"sandbox/cuda/tests/test_memory.py",
"sandbox/cuda/tests/test_mlp.py",
"sandbox/cuda/tests/test_bench_loopfusion.py",
"sandbox/cuda/tests/test_blocksparse.py",
"sandbox/cuda/tests/test_cuda_ndarray.py",
"sandbox/cuda/tests/test_tensor_op.py",
"sandbox/cuda/tests/test_extra_ops.py",
"sandbox/cuda/tests/test_gemmcorr3d.py",
"sandbox/cuda/tests/test_viewop.py",
"sandbox/gpuarray/tests/__init__.py", "sandbox/gpuarray/tests/__init__.py",
"sandbox/scan_module/scan_utils.py", "sandbox/scan_module/scan_utils.py",
"sandbox/scan_module/scan.py", "sandbox/scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论