Commit b69ad54d, authored by Xavier Bouthillier

Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py
......@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5
broad = (False,) * 5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()])
......@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d
dCdW = numpy.zeros(WShape, dtype=V.dtype)
......@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth):
for r in xrange(0, outputDur):
dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m]
dCdW[j, z, k, l, m] += dCdH[
i, j, p, q, r] * \
V[i, z, dr * p + k,
dc * q + l,
dt * r + m]
output_storage[0][0] = dCdW
......@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
dCdW = outputs[0]
codeSource = """
codeSource = """
///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n");
......@@ -285,7 +284,7 @@ if(!work_complete){
# This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position.
codeSource = """
codeSource = """
__global__ void
//thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
......
......@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else:
RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_],
outputs=[CudaNdarrayType(dtype=H_.dtype,
broadcastable=(False,)*5)()])
return theano.Apply(
self, inputs=[W_, b_, d_, H_, RShape_],
outputs=[CudaNdarrayType(
dtype=H_.dtype, broadcastable=(False,) * 5)()])
def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs
......@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert dc > 0
assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur
videoHeight = (outputHeight - 1) * dr + filterHeight
videoWidth = (outputWidth - 1) * dc + filterWidth
videoDur = (outputDur - 1) * dt + filterDur
if Rshape is not None and Rshape[0] != -1:
if Rshape[0] < videoHeight:
......@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else:
# print "No Rshape passed in"
# print "video size: "+str((videoHeight, videoWidth, videoDur))
# print "video size: " + str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype)
R = numpy.zeros((batchSize, inputChannels, videoHeight,
videoWidth, videoDur),
dtype=H.dtype)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0, batchSize):
# print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0, inputChannels):
# print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels)
# print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
for r in xrange(0, videoHeight):
# print '\t\t\trow '+str(r+1)+'/'+str(videoHeight)
# print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
for c in xrange(0, videoWidth):
for t in xrange(0, videoDur):
R[i, j, r, c, t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur + 1 )/float(dt))) ])
fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ])
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ])
ftc = max(
[0,
int(numpy.ceil(
float(t - filterDur + 1) / float(dt)
))
]
)
fcc = max(
[0,
int(numpy.ceil(
float(c - filterWidth + 1) / float(dc)
))
]
)
rc = max(
[0,
int(numpy.ceil(
float(r - filterHeight + 1) / float(dr)
))
]
)
while rc < outputHeight:
rk = r - rc * dr
if rk < 0:
......@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if tk < 0:
break
R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] )
R[i, j, r, c, t] += numpy.dot(
W[:, j, rk, ck, tk],
H[i, :, rc, cc, tc])
tc += 1
"" # close loop over tc
......
......@@ -5,9 +5,9 @@ import numpy as np
import theano
import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse
from theano.misc.pycuda_init import pycuda_available
if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
......@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
o1 = i1 + 1
input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
input)
input)
else:
o1 = i1
input_padded = input
......@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1)]
elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1)]
else:
raise ValueError('invalid mode')
......@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
input_shape=input_fft_v_shape,
filter_shape=filters_fft_v_shape)
#output_fft_s = input_fft_v
# output_fft_s = input_fft_v
# reshape for IFFT
output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
......@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here
# how much values are expected.
if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1),
(f2 - 1):(f2 - 1 + i2 - f2 + 1)]
elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)]
output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1),
(f2 - 1):(f2 - 1 + i2 + f2 - 1)]
else:
raise ValueError('invalid mode')
#output = output_circ[:, :, :, :, :]
# output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT.
......
......@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
rest of the buffer is trashed by this function.
Notes
-----
-----
buf should be in gpu shared memory, we access it many times.
"""
......@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop.
"""
return [
# get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_max = ' + buf + '[0]',
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]',
'}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_sum = ' + buf + '[0]',
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum',
'}',
'__syncthreads()',
]
return [ # get max of buf (trashing all but buf[0])
inline_reduce_max(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_max = ' + buf + '[0]',
'__syncthreads()',
'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
threadCount + '){',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf2 + '[__i] = ' + buf + '[__i]', '}',
'__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()',
'float row_sum = ' + buf + '[0]',
'__syncthreads()',
# divide each exp() result by the sum to complete the job.
'for(int __i=' + threadPos + '; __i<' + N +
'; __i+=' + threadCount + '){',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
'__syncthreads()',
]
@code_version((1,))
......@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos),
"%s[i]" % buf)
loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......
from __future__ import absolute_import, print_function, division
# This is work in progress
from theano import Op, Apply, tensor
from theano import Apply, tensor
from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp
......
......@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
""" % locals()
def c_support_code_apply(self, node, nodename):
    """Return the CUDA source of the two softmax kernels for this node.

    Two kernels are generated with ``nvcc_kernel`` and returned as one
    string, separated by a newline:

    * ``kSoftmax_<nodename>`` copies each row of ``x`` into the
      ``extern __shared__`` buffer ``buf`` (with a second copy in
      ``buf2 = buf + N``), inlines the code produced by
      ``inline_softmax`` and writes ``buf`` back to ``sm``.
    * ``kSoftmax_fixed_shared<nodename>`` hands per-row pointers into
      ``x`` and ``sm`` to ``inline_softmax_fixed_shared``.

    In both kernels each block starts at row ``blockIdx.x`` and advances
    by ``gridDim.x`` until ``M`` rows have been handled.

    Parameters
    ----------
    node
        The Apply node being compiled (not used here).
    nodename : str
        Suffix interpolated into the kernel names.

    Returns
    -------
    str
        Source of both kernels, joined by ``"\n"``.
    """
    # Both kernels take exactly the same parameter list; each call gets
    # its own list object built from this tuple.
    kernel_params = ('int M', 'int N',
                     'const float * x', 'const int sx0', 'const int sx1',
                     'float * sm', 'const int sm_s0', 'const int sm_s1')
    # Loop headers shared by the generated bodies.
    row_loop = ("for (int blockIDX = blockIdx.x; blockIDX < M;"
                " blockIDX += gridDim.x){")
    col_loop = "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){"

    shared_body = [
        "extern __shared__ float buf[]",
        "float * buf2 = buf + N",
        row_loop,
        # Load one row of x into buf and mirror it into buf2.
        col_loop,
        "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
        "buf2[tx] = buf[tx]",
        "}",
        "__syncthreads()",
        inline_softmax('N', 'buf', 'buf2',
                       'threadIdx.x', 'blockDim.x'),
        # Write every value of the row to the output.
        col_loop,
        "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
        "}",
        "__syncthreads()",
        "}",
    ]
    ret1 = nvcc_kernel("kSoftmax_%s" % nodename,
                       params=list(kernel_params),
                       body=shared_body)

    fixed_shared_body = [
        "extern __shared__ float buf[]",
        row_loop,
        "const float *x_ptr = &x[blockIDX * sx0]",
        "float *sm_ptr = &sm[blockIDX * sm_s0]",
        inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
                                    'sm_ptr', 'sm_s1',
                                    'threadIdx.x', 'blockDim.x'),
        "__syncthreads()",
        "}",
    ]
    ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename,
                       params=list(kernel_params),
                       body=fixed_shared_body)

    return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax()
......@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[
"extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] += b[tx * sb0]",
"buf2[tx] = buf[tx]",
"}",
"__syncthreads()",
inline_softmax('N', 'buf', 'buf2',
'threadIdx.x', 'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
body=["extern __shared__ float buf[]",
"float * buf2 = buf + N",
"for (int blockIDX = blockIdx.x; blockIDX < M;"
" blockIDX += gridDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] += b[tx * sb0]",
"buf2[tx] = buf[tx]", "}",
"__syncthreads()", inline_softmax('N', 'buf', 'buf2',
'threadIdx.x',
'blockDim.x'),
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"__syncthreads()", "}", ])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N',
'const float * x',
......@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1',
'sm_ptr', 'sm_s1',
'sm_ptr',
'sm_s1',
'threadIdx.x',
'blockDim.x',
'b', 'sb0'),
......
......@@ -4,7 +4,6 @@ import logging
import os
import subprocess
import sys
import warnings
from locale import getpreferredencoding
import numpy
......@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
_logger.debug('Writing module C++ code to %s', cppfilename)
cppfile.write(src_code)
lib_filename = os.path.join(location, '%s.%s' %
(module_name, get_lib_extension()))
lib_filename = os.path.join(
location, '%s.%s' %
(module_name, get_lib_extension()))
_logger.debug('Generating shared lib %s', lib_filename)
# TODO: Why do these args cause failure on gtx285 that has 1.3
......@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
continue
for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
'-L', '--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math',
'--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math',
'--use-local-env', '--cl-version=']:
......@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
# https://wiki.debian.org/RpathIssue for details.
if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))):
os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib'))
if sys.platform != 'darwin':
......@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof = cmd.index('-u')
cmd.pop(indexof) # Remove -u
cmd.pop(indexof) # Remove argument to -u
except ValueError as e:
except ValueError:
done = True
# CUDA Toolkit v4.1 Known Issues:
......@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
try:
os.chdir(location)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
console_encoding = getpreferredencoding()
nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
finally:
os.chdir(orig_dir)
......
(Diff collapsed.)
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from __future__ import absolute_import, print_function, division
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
import numpy
import theano.gof
from theano.compat import PY3
......@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb
from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config = theano.config
class CURAND_Base(GpuOp):
"""
"""
Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by
......@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op.
"""
return (
self.destructive,
return (self.destructive,
self.output_type,
self.seed,
)
......@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
def make_node(self, generator, size):
return theano.gof.Apply(self, [generator, size],
[generator.type(), self.output_type()])
[generator.type(), self.output_type()])
@classmethod
def new_auto_update(cls, generator, ndim, dtype, size, seed):
......@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
v_size = theano.tensor.as_tensor_variable(size)
if ndim is None:
ndim = get_vector_length(v_size)
self = cls(
output_type=CudaNdarrayType((False,) * ndim),
seed=seed,
destructive=False)
self = cls(output_type=CudaNdarrayType((False,) * ndim),
seed=seed,
destructive=False)
o_gen, sample = self(generator, cast(v_size, 'int32'))
......@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable.
Parameters
----------
seed : int
......@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
return rval
def uniform(self, size, low=0.0, high=1.0, ndim=None,
dtype=config.floatX):
dtype=config.floatX):
"""
Return symbolic tensor of uniform numbers.
......@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg
for i in size]), msg
else:
msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed())
self.next_seed())
self.state_updates.append(u.update)
rval = u * (high - low) + low
if u.type.broadcastable != rval.type.broadcastable:
......@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of '
'low and `high` arguments'
)
return rval
return rval
def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
dtype=config.floatX):
dtype=config.floatX):
"""
Return symbolic tensor of normally-distributed numbers.
......@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg
for i in size]), msg
else:
msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed())
self.next_seed())
self.state_updates.append(u.update)
rval = u * std + avg
if u.type.broadcastable != rval.type.broadcastable:
......@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of `low`'
'and `high` arguments'
)
return rval
return rval
@local_optimizer([CURAND_Base])
......@@ -386,5 +383,5 @@ def local_destructive(node):
return new_op.make_node(*node.inputs).outputs
return False
# Register the `local_destructive` optimizer (wrapped by `opt.in2out`) in
# `optdb` under the name 'CURAND_destructive'.
# NOTE(review): 99 is presumably the position in the optimization sequence
# and 'fast_run' / 'inplace' the tags that enable it -- confirm against
# `optdb.register`.
optdb.register('CURAND_destructive',
               opt.in2out(local_destructive, ignore_newtrees=True),
               99, 'fast_run', 'inplace')
......@@ -6,7 +6,7 @@ import theano
try:
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
except ImportError:
# To have the GPU back-end work without nose, we need this file to
......@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo = f.maker.fgraph.toposort()
assert len(topo) == 2
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' +
'but got:']+[str(app) for app in topo])
msg = '\n\t'.join(
['Expected exactly one occurrence of GpuCAReduce ' +
'but got:'] + [str(app) for app in topo])
raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS "
......
......@@ -5,24 +5,22 @@ import itertools
from nose.plugins.skip import SkipTest
import numpy as np
from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False:
if cuda_ndarray.cuda_available:
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
else:
raise SkipTest('Optional package cuda disabled')
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu
......@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32")
a = np.random.random((block_max_size + 2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,),
a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a))
......@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks
a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks
a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1
a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum
a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self):
......@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2
a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1
a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1
a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32")
if axis is None:
# Avoid floating point error
a = np.sign(a-0.5).astype("float32")
a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2
a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error
a = np.sign(a - 0.5).astype(
"float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self):
......
from __future__ import absolute_import, print_function, division
import unittest
import numpy
import copy
import theano
from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared
from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs = shared(inputs_val)
filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample,
conv = theano.tensor.nnet.convTransp3D(W=filters,
b=bias,
d=subsample,
H=inputs)
f_ref = theano.function([], conv)
res_ref = f_ref()
......
......@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......
......@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest
if cuda.cuda_available == False:
if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
return ("(n malloc/theano mem allocated in KB)",
n_mallocs + extra_alloc,
int(theano_alloc / 1024) + extra_size)
int(theano_alloc / 1024))
return ("n malloc on the gpu", n_mallocs + extra_alloc)
# I don't use the following by default as if there is other stuff running
......@@ -83,9 +83,12 @@ def test_memory():
variables = cuda.shared_constructor(np.ones((shapes[1],),
dtype='float32'))
derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
print("Shared took ", np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB")
print("Shared took ",
np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem()
print("Before compilation", mem2)
......@@ -112,7 +115,7 @@ def test_memory():
del obj
# print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem())
# assert mem2 == freemem(), (mem2, freemem())
del grad
print("After deleting function 2", freemem())
......@@ -155,16 +158,19 @@ def test_memory_lazy():
derp = ifelse.IfElse(1)(branch_select,
derp, some_matrix[:shapes[0]].sum())
derp += 1
print("Shared took ", np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) * 4 / 1024, "kB")
print("Shared took ",
np.prod(variables.get_value(
borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem()
print("Before compilation", mem2)
mem2_1 = freemem(extra_alloc=more_alloc1)
obj = theano.function([some_vector, branch_select], derp,
mode=mode_with_gpu)
#theano.printing.debugprint(obj, print_type=True)
# theano.printing.debugprint(obj, print_type=True)
mem3 = freemem()
print("After function compilation 1", mem3)
assert mem2_1 == mem3, (mem2_1, mem3)
......
......@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!')
# Skip test if cuda_ndarray is not available.
if tcn.cuda_available == False:
if tcn.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
......@@ -68,7 +68,7 @@ def print_mode(mode):
def print_diff_mode(a, b):
    """Print the profiling difference between modes ``a`` and ``b``.

    ``a.print_diff_summary(b)`` is called only when ``a`` is not None and
    both ``a`` and ``b`` are ``theano.compile.ProfileMode`` instances.
    In every other case the function does nothing and returns None.
    """
    if a is None:
        return
    profile_mode = theano.compile.ProfileMode
    if isinstance(a, profile_mode) and isinstance(b, profile_mode):
        a.print_diff_summary(b)
......@@ -138,8 +138,8 @@ def test_run_nnet():
# print "cpu:", rval_cpu
# print "gpu:", rval_gpu
abs_diff, rel_diff = \
theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
rval_cpu)
theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
rval_cpu)
max_abs_diff = abs_diff.max()
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# max_abs_diff, rel_diff.max(), n_in, n_hid)
......@@ -147,19 +147,20 @@ def test_run_nnet():
rtol = 1e-4
if n_in * n_hid >= 2048 * 4096:
rtol = 7e-4
assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid)
assert numpy.allclose(
rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid)
def test_run_nnet_med():
    """Smoke test: run ``run_nnet`` once on a medium-sized configuration.

    The random number generator is seeded through ``utt.seed_rng()``
    first. The result of ``run_nnet`` is deliberately discarded; the test
    passes as long as the call completes without raising.
    """
    utt.seed_rng()
    # NOTE(review): the leading False is presumably `use_gpu` (the value
    # used to be stored in a local named `rval_cpu`) -- confirm against
    # the signature of run_nnet.
    run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small():
    """Smoke test: run ``run_nnet`` once on a small configuration.

    The random number generator is seeded through ``utt.seed_rng()``
    first. The result of ``run_nnet`` is deliberately discarded; the test
    passes as long as the call completes without raising.
    """
    utt.seed_rng()
    # NOTE(review): the leading False is presumably `use_gpu` (the value
    # used to be stored in a local named `rval_cpu`) -- confirm against
    # the signature of run_nnet.
    run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu):
......@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode = get_mode(use_gpu)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
......@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1)
logical_hid_shape[1] // 2),
shape_kern1[2:],
n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
......@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode = get_mode(use_gpu)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n
......@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else:
hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')
))[:, :, ::2, ::2])
hid = tensor.tanh(
(conv_op(x, w0) + b0.dimshuffle(
(0, 'x', 'x')))[:, :, ::2, ::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out,
tensor.argmax(y, axis=1)) * lr)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
out, tensor.argmax(y, axis=1)) * lr)
# print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c]
......@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode = get_mode(use_gpu, check_isfinite)
# print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p,
g in zip(params, gparams)])
train = pfunc(
[x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
if verbose:
theano.printing.debugprint(train)
......@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
utt.seed_rng(seed) # Seeds numpy.random with seed
train, params, x_shape, y_shape, mode = build_conv_nnet2_classif(
use_gpu=use_gpu,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
use_gpu=use_gpu,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
if use_gpu:
device = 'GPU'
......@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr = theano._asarray(0.01, dtype='float32')
rvals = my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode)
if pickle and isinstance(mode, theano.compile.ProfileMode):
......@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare = True
if not compare:
return run_conv_nnet2_classif(use_gpu=use_gpu,
seed=seed, isize=isize, ksize=ksize, bsize=bsize,
n_train=n_train,
check_isfinite=check_isfinite,
pickle=pickle,
verbose=verbose,
version=version)
return run_conv_nnet2_classif(
use_gpu=use_gpu,
seed=seed, isize=isize, ksize=ksize, bsize=bsize,
n_train=n_train,
check_isfinite=check_isfinite,
pickle=pickle,
verbose=verbose,
version=version)
utt.seed_rng(seed) # Seeds numpy.random with seed
train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
build_conv_nnet2_classif(
use_gpu=False,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
build_conv_nnet2_classif(
use_gpu=False,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
utt.seed_rng(seed) # Seeds numpy.random with seed
train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
build_conv_nnet2_classif(
use_gpu=True,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
build_conv_nnet2_classif(
use_gpu=True,
isize=isize,
ksize=ksize,
n_batch=bsize,
verbose=verbose,
version=version,
check_isfinite=check_isfinite)
assert x_shape == x_shape_gpu
assert y_shape == y_shape_gpu
......@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally:
theano.tensor.basic.float32_atol = orig_float32_atol
if pickle:
if isinstance(cpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN CPU profile mode dump")
print(pickle.dumps(cpu_mode))
print("END CPU profile mode dump")
if isinstance(gpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN GPU profile mode dump")
print(pickle.dumps(gpu_mode))
print("END GPU profile mode dump")
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % (
......
# Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest
import unittest
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled')
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论