提交 b69ad54d authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py
......@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5
broad = (False,) * 5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
......@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d
dCdW = numpy.zeros(WShape, dtype=V.dtype)
......@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth):
for r in xrange(0, outputDur):
dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m]
dCdW[j, z, k, l, m] += dCdH[
i, j, p, q, r] * \
V[i, z, dr * p + k,
dc * q + l,
dt * r + m]
output_storage[0][0] = dCdW
......
......@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else:
RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
outputs=[CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
return theano.Apply(
self, inputs=[W_, b_, d_, H_, RShape_],
outputs=[CudaNdarrayType(
dtype=H_.dtype, broadcastable=(False,) * 5)()])
def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs
......@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert dc > 0
assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur
videoHeight = (outputHeight - 1) * dr + filterHeight
videoWidth = (outputWidth - 1) * dc + filterWidth
videoDur = (outputDur - 1) * dt + filterDur
if Rshape is not None and Rshape[0] != -1:
if Rshape[0] < videoHeight:
......@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# print "No Rshape passed in"
# print "video size: "+str((videoHeight, videoWidth, videoDur))
# print "video size: " + str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype)
R = numpy.zeros((batchSize, inputChannels, videoHeight,
videoWidth, videoDur),
dtype=H.dtype)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0, batchSize):
# print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0, inputChannels):
# print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
# print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
for r in xrange(0, videoHeight):
# print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
# print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
for c in xrange(0, videoWidth):
for t in xrange(0, videoDur):
R[i, j, r, c, t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur + 1 )/float(dt))) ])
fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ])
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
ftc = max(
[0,
int(numpy.ceil(
float(t - filterDur + 1) / float(dt)
))
]
)
fcc = max(
[0,
int(numpy.ceil(
float(c - filterWidth + 1) / float(dc)
))
]
)
rc = max(
[0,
int(numpy.ceil(
float(r - filterHeight + 1) / float(dr)
))
]
)
while rc < outputHeight:
rk = r - rc * dr
if rk < 0:
......@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if tk < 0:
break
R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] )
R[i, j, r, c, t] += numpy.dot(
W[:, j, rk, ck, tk],
H[i, :, rc, cc, tc])
tc += 1
"" # close loop over tc
......
from __future__ import absolute_import, print_function, division
import copy
import os
import logging
_logger = logging.getLogger(__name__)
from six import integer_types
from six.moves import StringIO, reduce
import theano
from theano import Apply
from theano import tensor
......@@ -15,6 +11,7 @@ from theano.sandbox.cuda import GpuOp
from theano.sandbox.cuda.basic_ops import (as_cuda_ndarray_variable,
gpu_contiguous)
from theano.tensor import as_tensor_variable
_logger = logging.getLogger(__name__)
class GpuBatchedDot(GpuOp):
......@@ -183,8 +180,7 @@ class GpuBatchedDot(GpuOp):
}
} else {
// copy inputs if not contiguous
""" +
("\n".join("""
""" + ("\n".join("""
if (( CudaNdarray_HOST_DIMS(%(var)s)[0] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[0] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[1] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[1] != 1
&& CudaNdarray_HOST_DIMS(%(var)s)[2] > 1 && CudaNdarray_HOST_STRIDES(%(var)s)[2] != 1)
......@@ -198,8 +194,7 @@ class GpuBatchedDot(GpuOp):
Py_XDECREF(%(var)s);
%(var)s = _copy;
}
""" % dict(var=var, fail=fail) for var in (bx, by)))
+ """
""" % dict(var=var, fail=fail) for var in (bx, by))) + """
// fail if the output is not contiguous; we can't copy it because we
// need to write to the original memory
......@@ -537,8 +532,8 @@ class GpuGemm(GpuOp):
return 'GpuGemm{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)\
and self.inplace == other.inplace)
return (type(self) == type(other) and
self.inplace == other.inplace)
def __hash__(self):
return hash(type(self)) ^ hash(self.inplace)
......@@ -562,7 +557,7 @@ class GpuGemm(GpuOp):
return (4,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
# z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
......@@ -657,8 +652,8 @@ class GpuGemv(GpuOp):
return 'GpuGemv{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)\
and self.inplace == other.inplace)
return (type(self) == type(other) and
self.inplace == other.inplace)
def __hash__(self):
return hash(type(self)) ^ hash(self.inplace)
......@@ -682,7 +677,7 @@ class GpuGemv(GpuOp):
return (3,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
# z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in, a, x, y, b = inputs
......@@ -757,8 +752,8 @@ class GpuGer(GpuOp):
return 'GpuGer{no_inplace}'
def __eq__(self, other):
return (type(self) == type(other)\
and self.inplace == other.inplace)
return (type(self) == type(other) and
self.inplace == other.inplace)
def __hash__(self):
return hash(type(self)) ^ hash(self.inplace)
......@@ -782,7 +777,7 @@ class GpuGer(GpuOp):
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
#z_out = alpha * dot(x,y) + beta * z_in
# z_out = alpha * dot(x,y) + beta * z_in
# inplace version, set z_out = z_in
# not inplace version, we copy z_in to z_out.
z_in, a, x, y = inputs
......@@ -1283,11 +1278,15 @@ class GpuCorrMM_gradWeights(BaseGpuCorrMM):
bottom, top = inp[:2]
weights, = grads
weights = gpu_contiguous(weights)
d_bottom = GpuCorrMM_gradInputs(self.border_mode, self.subsample)(
weights, top, bottom.shape[-2:])
d_top = GpuCorrMM(self.border_mode, self.subsample)(
bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
d_bottom = GpuCorrMM_gradInputs(
self.border_mode, self.subsample)(weights,
top,
bottom.shape[-2:])
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_bottom, d_top) + d_height_width
def connection_pattern(self, node):
......@@ -1342,11 +1341,14 @@ class GpuCorrMM_gradInputs(BaseGpuCorrMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorrMM_gradWeights(self.border_mode, self.subsample)(
d_weights = GpuCorrMM_gradWeights(
self.border_mode, self.subsample)(
bottom, top, weights.shape[-2:])
d_top = GpuCorrMM(self.border_mode, self.subsample)(
bottom, weights)
d_height_width = (theano.gradient.DisconnectedType()(),) * 2 if len(inp) == 4 else ()
d_top = GpuCorrMM(
self.border_mode, self.subsample)(bottom, weights)
d_height_width = (
theano.gradient.DisconnectedType()(),
) * 2 if len(inp) == 4 else ()
return (d_weights, d_top) + d_height_width
def connection_pattern(self, node):
......@@ -1755,10 +1757,16 @@ class GpuCorr3dMM(BaseGpuCorr3dMM):
bottom, weights = inp
top, = grads
top = gpu_contiguous(top)
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode, self.subsample, self.pad)(
weights, top, bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
bottom, top, weights.shape[-3:])
d_bottom = GpuCorr3dMM_gradInputs(self.border_mode,
self.subsample,
self.pad)(weights,
top,
bottom.shape[-3:])
d_weights = GpuCorr3dMM_gradWeights(self.border_mode,
self.subsample,
self.pad)(bottom,
top,
weights.shape[-3:])
return d_bottom, d_weights
......@@ -1863,11 +1871,14 @@ class GpuCorr3dMM_gradInputs(BaseGpuCorr3dMM):
weights, top = inp[:2]
bottom, = grads
bottom = gpu_contiguous(bottom)
d_weights = GpuCorr3dMM_gradWeights(self.border_mode, self.subsample, self.pad)(
d_weights = GpuCorr3dMM_gradWeights(
self.border_mode, self.subsample, self.pad)(
bottom, top, weights.shape[-3:])
d_top = GpuCorr3dMM(self.border_mode, self.subsample, self.pad)(
d_top = GpuCorr3dMM(
self.border_mode, self.subsample, self.pad)(
bottom, weights)
d_height_width_depth = (theano.gradient.DisconnectedType()(),) * 3 if len(inp) == 5 else ()
d_height_width_depth = (theano.gradient.DisconnectedType()(),)\
* 3 if len(inp) == 5 else ()
return (d_weights, d_top) + d_height_width_depth
def connection_pattern(self, node):
......@@ -2186,7 +2197,7 @@ class GpuDownsampleFactorMax(GpuOp):
return Apply(self, [x], [x.type()])
# def perform(self, node, input_storage, output_storage):
#raise NotImplementedError('only C is implemented')
# raise NotImplementedError('only C is implemented')
def c_code_cache_version(self):
return (6)
......
......@@ -5,9 +5,9 @@ import numpy as np
import theano
import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse
from theano.misc.pycuda_init import pycuda_available
if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
......@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how many values are expected.
if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1)]
elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1)]
else:
raise ValueError('invalid mode')
......@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
input_shape=input_fft_v_shape,
filter_shape=filters_fft_v_shape)
#output_fft_s = input_fft_v
# output_fft_s = input_fft_v
# reshape for IFFT
output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
......@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how many values are expected.
if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1),
(f2 - 1):(f2 - 1 + i2 - f2 + 1)]
elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1),
(f2 - 1):(f2 - 1 + i2 + f2 - 1)]
else:
raise ValueError('invalid mode')
#output = output_circ[:, :, :, :, :]
# output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
......
......@@ -167,17 +167,15 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
"""
return [
# get max of buf (trashing all but buf[0])
return [ # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_max = ' + buf + '[0]',
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
buf2 + '[__i] = ' + buf + '[__i]', '}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
......@@ -186,8 +184,7 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
'__syncthreads()',
]
......@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......
from __future__ import absolute_import, print_function, division
# This is work in progress
from theano import Op, Apply, tensor
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
......
......@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
""" % locals()
def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
ret1 = nvcc_kernel(
"kSoftmax_%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
'const float * x',
'const int sx0',
'const int sx1',
'float * sm',
'const int sm_s0',
'const int sm_s1'],
body=["extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'),
"buf2[tx] = buf[tx]", "}", "__syncthreads()",
inline_softmax('N',
'buf',
'buf2',
'threadIdx.x',
'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
# This sets all values correctly
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"__syncthreads()", "}", ])
ret2 = nvcc_kernel(
"kSoftmax_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x', 'const int sx0', 'const int sx1',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
body=["extern __shared__ float buf[]",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"const float *x_ptr = &x[blockIDX * sx0]",
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'threadIdx.x', 'blockDim.x'),
"__syncthreads()",
"}",
])
'threadIdx.x',
'blockDim.x'),
"__syncthreads()", "}", ])
return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax()
......@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
body=["extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] += b[tx * sb0]",
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'),
"buf2[tx] = buf[tx]", "}",
"__syncthreads()", inline_softmax('N', 'buf', 'buf2',
'threadIdx.x',
'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"__syncthreads()", "}", ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x',
......@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'sm_ptr',
'sm_s1',
'threadIdx.x',
'blockDim.x',
'b', 'sb0'),
......
......@@ -4,7 +4,6 @@ import logging
import os
import subprocess
import sys
import warnings
from locale import getpreferredencoding
import numpy
......@@ -249,7 +248,8 @@ class NVCC_compiler(Compiler):
_logger.debug('Writing module C++ code to %s', cppfilename)
cppfile.write(src_code)
lib_filename = os.path.join(location, '%s.%s' %
lib_filename = os.path.join(
location, '%s.%s' %
(module_name, get_lib_extension()))
_logger.debug('Generating shared lib %s', lib_filename)
......@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof = cmd.index('-u')
cmd.pop(indexof) # Remove -u
cmd.pop(indexof) # Remove argument to -u
except ValueError as e:
except ValueError:
done = True
# CUDA Toolkit v4.1 Known Issues:
......@@ -364,6 +364,8 @@ class NVCC_compiler(Compiler):
console_encoding = getpreferredencoding()
nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
finally:
os.chdir(orig_dir)
......
差异被折叠。
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from __future__ import absolute_import, print_function, division
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
import numpy
import theano.gof
from theano.compat import PY3
......@@ -17,6 +7,15 @@ from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb
from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config = theano.config
......@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
"""
return (
self.destructive,
return (self.destructive,
self.output_type,
self.seed,
)
......@@ -101,8 +99,7 @@ class CURAND_Base(GpuOp):
v_size = theano.tensor.as_tensor_variable(size)
if ndim is None:
ndim = get_vector_length(v_size)
self = cls(
output_type=CudaNdarrayType((False,) * ndim),
self = cls(output_type=CudaNdarrayType((False,) * ndim),
seed=seed,
destructive=False)
......@@ -386,5 +383,5 @@ def local_destructive(node):
return new_op.make_node(*node.inputs).outputs
return False
optdb.register('CURAND_destructive',
opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run',
'inplace')
opt.in2out(local_destructive, ignore_newtrees=True),
99, 'fast_run', 'inplace')
......@@ -6,7 +6,7 @@ import theano
try:
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
except ImportError:
# To have the GPU back-end work without nose, we need this file to
......@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
msg = '\n\t'.join(
['Expected exactly one occurrence of GpuCAReduce ' +
'but got:'] + [str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS "
......
......@@ -5,24 +5,22 @@ import itertools
from nose.plugins.skip import SkipTest
import numpy as np
from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
if cuda_ndarray.cuda_available:
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
else:
raise SkipTest('Optional package cuda disabled')
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu
......@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32")
a = np.random.random((block_max_size + 2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,),
a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a))
......@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1
a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum
a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self):
......@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
a = np.sign(a - 0.5).astype(
"float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self):
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import copy
import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared
from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs = shared(inputs_val)
filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
conv = theano.tensor.nnet.convTransp3D(W=filters,
b=bias,
d=subsample,
H=inputs)
f_ref = theano.function([], conv)
res_ref = f_ref()
......
......@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......
......@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
if cuda.cuda_available == False:
if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
return ("(n malloc/theano mem allocated in KB)",
n_mallocs + extra_alloc,
int(theano_alloc / 1024) + extra_size)
int(theano_alloc / 1024))
return ("n malloc on the gpu", n_mallocs + extra_alloc)
# I don't use the following by default as if there is other stuff running
......@@ -83,9 +83,12 @@ def test_memory():
variables = cuda.shared_constructor(np.ones((shapes[1],),
dtype='float32'))
derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
print("Shared took ", np.prod(variables.get_value(
print("Shared took ",
np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB")
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem()
print("Before compilation", mem2)
......@@ -112,7 +115,7 @@ def test_memory():
del obj
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
# assert mem2 == freemem(), (mem2, freemem())
del grad
print("After deleting function 2", freemem())
......@@ -155,16 +158,19 @@ def test_memory_lazy():
derp = ifelse.IfElse(1)(branch_select,
derp, some_matrix[:shapes[0]].sum())
derp += 1
print("Shared took ", np.prod(variables.get_value(
print("Shared took ",
np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB")
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem()
print("Before compilation", mem2)
mem2_1 = freemem(extra_alloc=more_alloc1)
obj = theano.function([some_vector, branch_select], derp,
mode=mode_with_gpu)
#theano.printing.debugprint(obj, print_type=True)
# theano.printing.debugprint(obj, print_type=True)
mem3 = freemem()
print("After function compilation 1", mem3)
assert mem2_1 == mem3, (mem2_1, mem3)
......
......@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!')
# Skip test if cuda_ndarray is not available.
if tcn.cuda_available == False:
if tcn.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -147,19 +147,20 @@ def test_run_nnet():
rtol = 1e-4
if n_in * n_hid >= 2048 * 4096:
rtol = 7e-4
assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
assert numpy.allclose(
rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid)
def test_run_nnet_med():
utt.seed_rng()
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000)
run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
utt.seed_rng()
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000)
run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
......@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode = get_mode(use_gpu)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
......@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
logical_hid_shape[1] // 2),
shape_kern1[2:],
n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
......@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode = get_mode(use_gpu)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
......@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else:
hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')
))[:, :, ::2, ::2])
hid = tensor.tanh(
(conv_op(x, w0) + b0.dimshuffle(
(0, 'x', 'x')))[:, :, ::2, ::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out,
tensor.argmax(y, axis=1)) * lr)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
out, tensor.argmax(y, axis=1)) * lr)
# print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
......@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode = get_mode(use_gpu, check_isfinite)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
if verbose:
theano.printing.debugprint(train)
......@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr = theano._asarray(0.01, dtype='float32')
rvals = my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
if pickle and isinstance(mode, theano.compile.ProfileMode):
......@@ -495,7 +506,8 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare = True
if not compare:
return run_conv_nnet2_classif(use_gpu=use_gpu,
return run_conv_nnet2_classif(
use_gpu=use_gpu,
seed=seed, isize=isize, ksize=ksize, bsize=bsize,
n_train=n_train,
check_isfinite=check_isfinite,
......@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally:
theano.tensor.basic.float32_atol = orig_float32_atol
if pickle:
if isinstance(cpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN CPU profile mode dump")
print(pickle.dumps(cpu_mode))
print("END CPU profile mode dump")
if isinstance(gpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN GPU profile mode dump")
print(pickle.dumps(gpu_mode))
print("END GPU profile mode dump")
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
......
# Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import unittest
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
......@@ -8,7 +8,7 @@ from theano.sandbox.rng_mrg import MRG_RandomStreams
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
# The PyCObject that represents the cuda random stream object
......
......@@ -2,7 +2,6 @@
This file test tensor op that should also operate on CudaNdaray.
"""
from __future__ import absolute_import, print_function, division
import copy
from nose.plugins.skip import SkipTest
import numpy
......@@ -14,7 +13,7 @@ import theano.tensor as T
# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda
from theano.tensor.nnet.tests import test_conv3d2d
if cuda.cuda_available == False:
if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -57,7 +56,7 @@ def test_softmax_optimizations():
one_of_n = tensor.lvector('one_of_n')
op = crossentropy_categorical_1hot
xe = op(x, one_of_n)
op(x, one_of_n)
fgraph = theano.gof.FunctionGraph(
[x, one_of_n],
......@@ -84,10 +83,10 @@ def test_may_share_memory_cuda():
# can't test the transpose as ta._strides = is not implemented
# manual transpose of a
#ta = a.reshape((4,3))
# ta = a.reshape((4,3))
# ta._strides = (ta._strides[1],ta._strides[0])#not implemented
#elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
#ta.gpudata += ta.size*elem_size
# elem_size=elem_size = numpy.zeros(0,dtype=a.dtype).dtype.itemsize
# ta.gpudata += ta.size*elem_size
for a_, b_, rep in [(a, a, True), (b, b, True), (a, b, False),
(a, na, False), (b, nb, False),
......@@ -95,8 +94,7 @@ def test_may_share_memory_cuda():
(a, va, True), (b, vb, True),
(va, b, False), (a, vb, False),
(a, ra, True), (b, rb, True),
(ra, b, False), (a, rb, False),
]:
(ra, b, False), (a, rb, False), ]:
assert may_share_memory(a_, b_) == rep
assert may_share_memory(b_, a_) == rep
......
......@@ -10,7 +10,7 @@ from theano.sandbox.cuda.var import float32_shared_constructor as f32sc
from theano.sandbox.cuda import CudaNdarrayType, cuda_available
import theano.sandbox.cuda as cuda
# Skip test if cuda_ndarray is not available.
if cuda_available == False:
if cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -26,19 +26,18 @@ def test_float32_shared_constructor():
# test that broadcastable arg is accepted, and that they
# don't strictly have to be tuples
assert eq(
f32sc(npy_row, broadcastable=(True, False)).type,
assert eq(f32sc(npy_row,
broadcastable=(True, False)).type,
CudaNdarrayType((True, False)))
assert eq(
f32sc(npy_row, broadcastable=[True, False]).type,
assert eq(f32sc(npy_row,
broadcastable=[True, False]).type,
CudaNdarrayType((True, False)))
assert eq(
f32sc(npy_row, broadcastable=numpy.array([True, False])).type,
assert eq(f32sc(npy_row,
broadcastable=numpy.array([True, False])).type,
CudaNdarrayType([True, False]))
# test that we can make non-matrix shared vars
assert eq(
f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
assert eq(f32sc(numpy.zeros((2, 3, 4, 5), dtype='float32')).type,
CudaNdarrayType((False,) * 4))
......@@ -77,7 +76,8 @@ class T_updates(unittest.TestCase):
x = tensor.fmatrix('x')
output_updates = [(output_var, x ** 2)]
output_givens = {x: data}
output_func = theano.function(inputs=[], outputs=[],
output_func = theano.function(
inputs=[], outputs=[],
updates=output_updates, givens=output_givens)
output_func()
......
from __future__ import absolute_import, print_function, division
import numpy
import unittest
from nose.plugins.skip import SkipTest
import theano
......@@ -11,7 +10,7 @@ mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
def test_viewop_gpu():
from theano.sandbox import cuda
if cuda.cuda_available == False:
if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
_x = theano.tensor.fvector('x')
x = cuda.gpu_from_host(_x)
......
from __future__ import absolute_import, print_function, division
from __future__ import print_function
import sys, time
import sys
import time
from six import iteritems
from theano.compile.pfunc import pfunc
from theano import tensor
......@@ -35,35 +36,47 @@ def showtimes(times):
def cmp_sigmoids(shape):
def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
times = compare_fns(
dict( numpy=numpy_sigmoid
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput)))
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 + tensor.exp(-shared_input)))])
),
1.0 / (1.0 + numpy.exp(-input))
sinput = tensor.Tensor(
dtype='float32', broadcastable=(0,) * len(shape))()
shared_input = tcn.shared_constructor(
numpy.random.rand(*shape),
'shared_input')
times = compare_fns(dict(
numpy=numpy_sigmoid,
theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput))),
theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input)))])),
input=shared_input.value)
showtimes(times)
def cmp_sigmoids_T(shape):
def numpy_sigmoid(input):
rval = 1.0 / (1.0 + numpy.exp(-input.T))
sinput = tensor.Tensor(dtype='float32', broadcastable=(0,)*len(shape))()
shared_input = tcn.shared_constructor(numpy.random.rand(*shape), 'shared_input')
times = compare_fns(
dict( numpy=numpy_sigmoid
, theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T)))
, theano_gpu_onboard=pfunc([sinput], [], updates=[(shared_input, 1.0 / (1.0 +
tensor.exp(-shared_input.T)))])
),
1.0 / (1.0 + numpy.exp(-input.T))
sinput = tensor.Tensor(
dtype='float32', broadcastable=(0,) * len(shape))()
shared_input = tcn.shared_constructor(
numpy.random.rand(*shape),
'shared_input')
times = compare_fns(dict(
numpy=numpy_sigmoid,
theano_cpu=pfunc([sinput], 1.0 / (1.0 + tensor.exp(-sinput.T))),
theano_gpu_onboard=pfunc(
[sinput],
[],
updates=[(
shared_input,
1.0 / (1.0 + tensor.exp(-shared_input.T)))])),
input=shared_input.value)
showtimes(times)
if __name__ == '__main__':
eval(sys.argv[1])
# cmp_sigmoids((640, 64*64)) # looks great in profiler
#cmp_sigmoids((173, 74*49))
#cmp_sigmoids_T((173, 74*49))
# cmp_sigmoids((173, 74*49))
# cmp_sigmoids_T((173, 74*49))
......@@ -259,8 +259,8 @@ class CudaNdarrayType(Type):
'complex64': (complex, 'theano_complex64',
'NPY_COMPLEX64')}[self.dtype]
except KeyError:
raise TypeError("Unsupported dtype for %s: %s" % (
self.__class__.__name__, self.dtype))
raise TypeError("Unsupported dtype for %s: %s" %
(self.__class__.__name__, self.dtype))
def __eq__(self, other):
"""
......@@ -271,9 +271,10 @@ class CudaNdarrayType(Type):
other.broadcastable == self.broadcastable)
def convert_variable(self, var):
if (type(self) == type(var.type) and
if (isinstance(self, type(var.type)) and
self.ndim == var.type.ndim and
all(sb == ob or ob for sb, ob in zip(self.broadcastable,
all(sb == ob or ob for sb, ob in zip(
self.broadcastable,
var.type.broadcastable))):
return theano.tensor.patternbroadcast(var, self.broadcastable)
......@@ -312,7 +313,7 @@ class CudaNdarrayType(Type):
return self.name
else:
b = self.broadcastable
#bcast = str(self.broadcastable)
# bcast = str(self.broadcastable)
if not numpy.any(b):
s = "%iD" % len(b)
else:
......@@ -327,7 +328,7 @@ class CudaNdarrayType(Type):
def __repr__(self):
return str(self)
#"CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
# "CudaNdarrayType{%s, %s}" % (str(self.dtype), str(self.broadcastable))
def c_declare(self, name, sub, check_input=True):
return """ CudaNdarray * %(name)s;""" % locals()
......@@ -563,8 +564,7 @@ theano.compile.register_deep_copy_op_c_code(
CudaNdarray_HOST_DIMS(%(oname)s)[i]) {
alloc = true;
break;
}
}
}}
if(alloc) {
Py_XDECREF(%(oname)s);
%(oname)s = (CudaNdarray*)CudaNdarray_Copy(%(iname)s);
......@@ -581,8 +581,7 @@ theano.compile.register_deep_copy_op_c_code(
%(fail)s;
}
}
""",
version=3)
""", version=3)
# THIS WORKS But CudaNdarray instances don't compare equal to one
......@@ -608,5 +607,5 @@ def CudaNdarray_pickler(cnda):
# In case cuda is not imported.
if cuda is not None:
copyreg.pickle(cuda.CudaNdarray, CudaNdarray_pickler,
CudaNdarray_unpickler)
copyreg.pickle(
cuda.CudaNdarray, CudaNdarray_pickler, CudaNdarray_unpickler)
......@@ -13,7 +13,7 @@ try:
# We must do those import to be able to create the full doc when nvcc
# is not available
from theano.sandbox.cuda import filter as type_support_filter
from theano.sandbox.cuda.basic_ops import HostFromGpu, GpuFromHost
from theano.sandbox.cuda.basic_ops import HostFromGpu
except ImportError:
pass
......@@ -33,6 +33,7 @@ class _operators(tensor.basic._tensor_py_operators):
def _as_TensorVariable(self):
return HostFromGpu()(self)
def _as_CudaNdarrayVariable(self):
return self
......@@ -54,6 +55,7 @@ class CudaNdarrayConstantSignature(tensor.TensorConstantSignature):
class CudaNdarrayConstant(_operators, Constant):
def signature(self):
return CudaNdarrayConstantSignature((self.type, numpy.asarray(self.data)))
def __str__(self):
if self.name is not None:
return self.name
......@@ -61,7 +63,7 @@ class CudaNdarrayConstant(_operators, Constant):
data = str(numpy.asarray(self.data))
except Exception as e:
data = "error while transferring the value: " + str(e)
return "CudaNdarrayConstant{"+data+"}"
return "CudaNdarrayConstant{" + data + "}"
CudaNdarrayType.Constant = CudaNdarrayConstant
......
......@@ -87,42 +87,8 @@ whitelist_flake8 = [
"sandbox/tests/test_theano_object.py",
"sandbox/tests/test_scan.py",
"sandbox/tests/__init__.py",
"sandbox/cuda/var.py",
"sandbox/cuda/GpuConvGrad3D.py",
"sandbox/cuda/basic_ops.py",
"sandbox/cuda/nnet.py",
"sandbox/cuda/elemwise.py",
"sandbox/cuda/type.py",
"sandbox/cuda/__init__.py",
"sandbox/cuda/opt.py",
"sandbox/cuda/blas.py",
"sandbox/cuda/blocksparse.py",
"sandbox/cuda/rng_curand.py",
"sandbox/cuda/fftconv.py",
"sandbox/cuda/kernel_codegen.py",
"sandbox/cuda/GpuConvTransp3D.py",
"sandbox/cuda/nvcc_compiler.py",
"sandbox/cuda/neighbours.py",
"sandbox/cuda/tests/__init__.py",
"sandbox/cuda/tests/walltime.py",
"sandbox/cuda/tests/test_gradient.py",
"sandbox/cuda/tests/test_neighbours.py",
"sandbox/cuda/tests/test_conv_cuda_ndarray.py",
"sandbox/cuda/tests/test_var.py",
"sandbox/cuda/tests/test_opt.py",
"sandbox/cuda/tests/test_blas.py",
"sandbox/cuda/tests/test_driver.py",
"sandbox/cuda/tests/test_rng_curand.py",
"sandbox/cuda/tests/test_basic_ops.py",
"sandbox/cuda/tests/test_memory.py",
"sandbox/cuda/tests/test_mlp.py",
"sandbox/cuda/tests/test_bench_loopfusion.py",
"sandbox/cuda/tests/test_blocksparse.py",
"sandbox/cuda/tests/test_cuda_ndarray.py",
"sandbox/cuda/tests/test_tensor_op.py",
"sandbox/cuda/tests/test_extra_ops.py",
"sandbox/cuda/tests/test_gemmcorr3d.py",
"sandbox/cuda/tests/test_viewop.py",
"sandbox/gpuarray/tests/__init__.py",
"sandbox/scan_module/scan_utils.py",
"sandbox/scan_module/scan.py",
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论