提交 b69ad54d authored 作者: Xavier Bouthillier's avatar Xavier Bouthillier

Merge pull request #4244 from ChihebTrabelsi/ccw2.0

flake8 sandbox/cuda/*.py
...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp): ...@@ -39,7 +39,7 @@ class GpuConvGrad3D(GpuOp):
d_ = T.as_tensor_variable(d) d_ = T.as_tensor_variable(d)
WShape_ = T.as_tensor_variable(WShape) WShape_ = T.as_tensor_variable(WShape)
dCdH_ = as_cuda_ndarray_variable(dCdH) dCdH_ = as_cuda_ndarray_variable(dCdH)
broad = (False,)*5 broad = (False,) * 5
return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_], return theano.Apply(self, inputs=[V_, d_, WShape_, dCdH_],
outputs=[CudaNdarrayType(dtype=V_.dtype, outputs=[CudaNdarrayType(dtype=V_.dtype,
broadcastable=broad)()]) broadcastable=broad)()])
...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp): ...@@ -51,15 +51,10 @@ class GpuConvGrad3D(GpuOp):
# partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m] # partial C / partial W[j,z,k,l,m] = sum_i sum_p sum_q sum_r (partial C /partial H[i,j,p,q,r] ) * V[i,z,dr*p+k,dc*q+l,dt*r+m]
batchSize = dCdH.shape[0] batchSize = dCdH.shape[0]
outputFilters = dCdH.shape[1]
outputHeight = dCdH.shape[2] outputHeight = dCdH.shape[2]
outputWidth = dCdH.shape[3] outputWidth = dCdH.shape[3]
outputDur = dCdH.shape[4] outputDur = dCdH.shape[4]
assert V.shape[0] == batchSize assert V.shape[0] == batchSize
inputFilters = V.shape[1]
inputHeight = V.shape[2]
inputWidth = V.shape[3]
inputDur = V.shape[4]
dr, dc, dt = d dr, dc, dt = d
dCdW = numpy.zeros(WShape, dtype=V.dtype) dCdW = numpy.zeros(WShape, dtype=V.dtype)
...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp): ...@@ -76,7 +71,11 @@ class GpuConvGrad3D(GpuOp):
for p in xrange(0, outputHeight): for p in xrange(0, outputHeight):
for q in xrange(0, outputWidth): for q in xrange(0, outputWidth):
for r in xrange(0, outputDur): for r in xrange(0, outputDur):
dCdW[j, z, k, l, m] += dCdH[i, j, p, q, r] * V[i, z, dr*p+k, dc*q+l, dt*r+m] dCdW[j, z, k, l, m] += dCdH[
i, j, p, q, r] * \
V[i, z, dr * p + k,
dc * q + l,
dt * r + m]
output_storage[0][0] = dCdW output_storage[0][0] = dCdW
...@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp): ...@@ -86,7 +85,7 @@ class GpuConvGrad3D(GpuOp):
dCdW = outputs[0] dCdW = outputs[0]
codeSource = """ codeSource = """
///////////// < code generated by GpuConvGrad3D > ///////////// < code generated by GpuConvGrad3D >
//printf("\t\t\t\tGpuConvGrad3DW c code\\n"); //printf("\t\t\t\tGpuConvGrad3DW c code\\n");
...@@ -285,7 +284,7 @@ if(!work_complete){ ...@@ -285,7 +284,7 @@ if(!work_complete){
# This code is not sensitive to the ignore_border flag. # This code is not sensitive to the ignore_border flag.
# It runs for every position in the output z, and then computes the gradient for the # It runs for every position in the output z, and then computes the gradient for the
# input pixels that were downsampled to that z-position. # input pixels that were downsampled to that z-position.
codeSource = """ codeSource = """
__global__ void __global__ void
//thread block size = WShape[4] //thread block size = WShape[4]
//grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3]) //grid block size = (WShape[0]*WShape[1],WShape[2]*WShape[3])
......
...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp): ...@@ -37,9 +37,10 @@ class GpuConvTransp3D(GpuOp):
else: else:
RShape_ = T.as_tensor_variable([-1, -1, -1]) RShape_ = T.as_tensor_variable([-1, -1, -1])
return theano.Apply(self, inputs=[W_, b_, d_, H_, RShape_], return theano.Apply(
outputs=[CudaNdarrayType(dtype=H_.dtype, self, inputs=[W_, b_, d_, H_, RShape_],
broadcastable=(False,)*5)()]) outputs=[CudaNdarrayType(
dtype=H_.dtype, broadcastable=(False,) * 5)()])
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
W, b, d, H, RShape = node.inputs W, b, d, H, RShape = node.inputs
...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -382,9 +383,9 @@ def computeR(W, b, d, H, Rshape=None):
assert dc > 0 assert dc > 0
assert dt > 0 assert dt > 0
videoHeight = (outputHeight-1) * dr + filterHeight videoHeight = (outputHeight - 1) * dr + filterHeight
videoWidth = (outputWidth-1) * dc + filterWidth videoWidth = (outputWidth - 1) * dc + filterWidth
videoDur = (outputDur-1) * dt + filterDur videoDur = (outputDur - 1) * dt + filterDur
if Rshape is not None and Rshape[0] != -1: if Rshape is not None and Rshape[0] != -1:
if Rshape[0] < videoHeight: if Rshape[0] < videoHeight:
...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -399,26 +400,46 @@ def computeR(W, b, d, H, Rshape=None):
# else: # else:
# print "No Rshape passed in" # print "No Rshape passed in"
# print "video size: "+str((videoHeight, videoWidth, videoDur)) # print "video size: " + str((videoHeight, videoWidth, videoDur))
R = numpy.zeros( (batchSize, inputChannels, videoHeight, R = numpy.zeros((batchSize, inputChannels, videoHeight,
videoWidth, videoDur ) , dtype=H.dtype) videoWidth, videoDur),
dtype=H.dtype)
# R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc] # R[i,j,r,c,t] = b_j + sum_{rc,rk | d \circ rc + rk = r} sum_{cc,ck | ...} \
# sum_{tc,tk | ...} sum_k W[k, j, rk, ck, tk] * H[i,k,rc,cc,tc]
for i in xrange(0, batchSize): for i in xrange(0, batchSize):
# print '\texample '+str(i+1)+'/'+str(batchSize) # print '\texample '+str(i+1)+'/'+str(batchSize)
for j in xrange(0, inputChannels): for j in xrange(0, inputChannels):
# print '\t\tfeature map '+str(j+1)+'/'+str(inputChannels) # print '\t\tfeature map ' + str(j+1) + '/' + str(inputChannels)
for r in xrange(0, videoHeight): for r in xrange(0, videoHeight):
# print '\t\t\trow '+str(r+1)+'/'+str(videoHeight) # print '\t\t\trow ' + str(r+1) + '/'+str(videoHeight)
for c in xrange(0, videoWidth): for c in xrange(0, videoWidth):
for t in xrange(0, videoDur): for t in xrange(0, videoDur):
R[i, j, r, c, t] = b[j] R[i, j, r, c, t] = b[j]
ftc = max([0, int(numpy.ceil(float(t-filterDur + 1 )/float(dt))) ]) ftc = max(
fcc = max([0, int(numpy.ceil(float(c-filterWidth + 1)/float(dc))) ]) [0,
int(numpy.ceil(
rc = max([0, int(numpy.ceil(float(r-filterHeight+1)/float(dr))) ]) float(t - filterDur + 1) / float(dt)
))
]
)
fcc = max(
[0,
int(numpy.ceil(
float(c - filterWidth + 1) / float(dc)
))
]
)
rc = max(
[0,
int(numpy.ceil(
float(r - filterHeight + 1) / float(dr)
))
]
)
while rc < outputHeight: while rc < outputHeight:
rk = r - rc * dr rk = r - rc * dr
if rk < 0: if rk < 0:
...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None): ...@@ -436,7 +457,9 @@ def computeR(W, b, d, H, Rshape=None):
if tk < 0: if tk < 0:
break break
R[i, j, r, c, t] += numpy.dot(W[:, j, rk, ck, tk], H[i, :, rc, cc, tc] ) R[i, j, r, c, t] += numpy.dot(
W[:, j, rk, ck, tk],
H[i, :, rc, cc, tc])
tc += 1 tc += 1
"" # close loop over tc "" # close loop over tc
......
...@@ -5,9 +5,9 @@ import numpy as np ...@@ -5,9 +5,9 @@ import numpy as np
import theano import theano
import theano.tensor as T import theano.tensor as T
from theano.misc.pycuda_init import pycuda_available
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
from theano.ifelse import ifelse from theano.ifelse import ifelse
from theano.misc.pycuda_init import pycuda_available
if cuda_available: if cuda_available:
from theano.sandbox.cuda import (basic_ops, CudaNdarrayType, from theano.sandbox.cuda import (basic_ops, CudaNdarrayType,
...@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -448,7 +448,7 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
o1 = i1 + 1 o1 = i1 + 1
input_padded = T.zeros((b, ic, o0, o1), dtype='float32') input_padded = T.zeros((b, ic, o0, o1), dtype='float32')
input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1], input_padded = T.set_subtensor(input_padded[:, :, :i0, :i1],
input) input)
else: else:
o1 = i1 o1 = i1
input_padded = input input_padded = input
...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -523,9 +523,11 @@ def conv2d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how much values are expected. # how much values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -655,7 +657,7 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v, output_fft_s = mult_and_reduce(input_fft_v, filters_fft_v,
input_shape=input_fft_v_shape, input_shape=input_fft_v_shape,
filter_shape=filters_fft_v_shape) filter_shape=filters_fft_v_shape)
#output_fft_s = input_fft_v # output_fft_s = input_fft_v
# reshape for IFFT # reshape for IFFT
output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2)) output_fft_flat = output_fft_s.reshape((b * oc, o0, o1, o2 // 2 + 1, 2))
...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None, ...@@ -673,12 +675,16 @@ def conv3d_fft(input, filters, image_shape=None, filter_shape=None,
# special way because we specify explicitly here # special way because we specify explicitly here
# how much values are expected. # how much values are expected.
if border_mode == 'valid': if border_mode == 'valid':
output = output_circ[:, :, (f0-1):(f0-1 + i0-f0+1), (f1-1):(f1-1 + i1-f1+1), (f2-1):(f2-1 + i2-f2+1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 - f0 + 1),
(f1 - 1):(f1 - 1 + i1 - f1 + 1),
(f2 - 1):(f2 - 1 + i2 - f2 + 1)]
elif border_mode == 'full': elif border_mode == 'full':
output = output_circ[:, :, (f0-1):(f0-1 + i0+f0-1), (f1-1):(f1-1 + i1+f1-1), (f2-1):(f2-1 + i2+f2-1)] output = output_circ[:, :, (f0 - 1):(f0 - 1 + i0 + f0 - 1),
(f1 - 1):(f1 - 1 + i1 + f1 - 1),
(f2 - 1):(f2 - 1 + i2 + f2 - 1)]
else: else:
raise ValueError('invalid mode') raise ValueError('invalid mode')
#output = output_circ[:, :, :, :, :] # output = output_circ[:, :, :, :, :]
# Rescale manually. This is just a factor that comes in during the # Rescale manually. This is just a factor that comes in during the
# trip through FFT and inverse FFT. # trip through FFT and inverse FFT.
......
...@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn): ...@@ -76,7 +76,7 @@ def inline_reduce(N, buf, pos, count, manner_fn):
rest of the buffer is trashed by this function. rest of the buffer is trashed by this function.
Notes Notes
----- -----
buf should be in gpu shared memory, we access it many times. buf should be in gpu shared memory, we access it many times.
""" """
...@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount): ...@@ -167,29 +167,26 @@ def inline_softmax(N, buf, buf2, threadPos, threadCount):
We use __i as an int variable in a loop. We use __i as an int variable in a loop.
""" """
return [ return [ # get max of buf (trashing all but buf[0])
# get max of buf (trashing all but buf[0]) inline_reduce_max(N, buf, threadPos, threadCount),
inline_reduce_max(N, buf, threadPos, threadCount), '__syncthreads()',
'__syncthreads()', 'float row_max = ' + buf + '[0]',
'float row_max = ' + buf + '[0]', '__syncthreads()',
'__syncthreads()', 'for(int __i=' + threadPos + '; __i<' + N + '; __i+=' +
'for(int __i=' + threadPos + '; __i<' + N + threadCount + '){',
'; __i+=' + threadCount + '){', buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
buf + '[__i] = exp(' + buf2 + '[__i] - row_max)', buf2 + '[__i] = ' + buf + '[__i]', '}',
buf2 + '[__i] = ' + buf + '[__i]', '__syncthreads()',
'}', inline_reduce_sum(N, buf, threadPos, threadCount),
'__syncthreads()', '__syncthreads()',
inline_reduce_sum(N, buf, threadPos, threadCount), 'float row_sum = ' + buf + '[0]',
'__syncthreads()', '__syncthreads()',
'float row_sum = ' + buf + '[0]', # divide each exp() result by the sum to complete the job.
'__syncthreads()', 'for(int __i=' + threadPos + '; __i<' + N +
# divide each exp() result by the sum to complete the job. '; __i+=' + threadCount + '){',
'for(int __i=' + threadPos + '; __i<' + N + buf + '[__i] = ' + buf2 + '[__i] / row_sum', '}',
'; __i+=' + threadCount + '){', '__syncthreads()',
buf + '[__i] = ' + buf2 + '[__i] / row_sum', ]
'}',
'__syncthreads()',
]
@code_version((1,)) @code_version((1,))
...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count, ...@@ -241,8 +238,7 @@ def inline_reduce_fixed_shared(N, buf, x, stride_x, pos, count,
init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals()) init = manner_init("%(x)s[%(pos)s * %(stride_x)s]" % locals())
loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" % loop_line = manner_fn("red", manner_init("%(x)s[i * %(stride_x)s]" %
locals())) locals()))
loop_line2 = manner_fn("%s[%s]" % (buf, pos), loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf)
"%s[i]" % buf)
r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos))
r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos)) r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos))
r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos)) r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos))
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
# This is work in progress # This is work in progress
from theano import Op, Apply, tensor from theano import Apply, tensor
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.sandbox.cuda import cuda_available, GpuOp from theano.sandbox.cuda import cuda_available, GpuOp
......
...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp): ...@@ -578,45 +578,46 @@ class GpuSoftmax(GpuOp):
""" % locals() """ % locals()
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
ret1 = nvcc_kernel("kSoftmax_%s" % nodename, ret1 = nvcc_kernel(
params=['int M', 'int N', "kSoftmax_%s" % nodename,
'const float * x', 'const int sx0', 'const int sx1', params=['int M', 'int N',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'const float * x',
body=[ 'const int sx0',
"extern __shared__ float buf[]", 'const int sx1',
"float * buf2 = buf + N", 'float * sm',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'const int sm_s0',
" blockIDX += gridDim.x){", 'const int sm_s1'],
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", body=["extern __shared__ float buf[]",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "float * buf2 = buf + N",
"buf2[tx] = buf[tx]", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"}", " blockIDX += gridDim.x){",
"__syncthreads()", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
inline_softmax('N', 'buf', 'buf2', "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
'threadIdx.x', 'blockDim.x'), "buf2[tx] = buf[tx]", "}", "__syncthreads()",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", inline_softmax('N',
# This set all value correctly 'buf',
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", 'buf2',
"}", 'threadIdx.x',
"__syncthreads()", 'blockDim.x'),
"}", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
]) # This set all value correctly
ret2 = nvcc_kernel("kSoftmax_fixed_shared%s" % nodename, "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
params=['int M', 'int N', "__syncthreads()", "}", ])
'const float * x', 'const int sx0', 'const int sx1', ret2 = nvcc_kernel(
'float * sm', 'const int sm_s0', 'const int sm_s1'], "kSoftmax_fixed_shared%s" % nodename,
body=[ params=['int M', 'int N',
"extern __shared__ float buf[]", 'const float * x', 'const int sx0', 'const int sx1',
"for (int blockIDX = blockIdx.x; blockIDX < M;" 'float * sm', 'const int sm_s0', 'const int sm_s1'],
" blockIDX += gridDim.x){", body=["extern __shared__ float buf[]",
"const float *x_ptr = &x[blockIDX * sx0]", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"float *sm_ptr = &sm[blockIDX * sm_s0]", " blockIDX += gridDim.x){",
inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', "const float *x_ptr = &x[blockIDX * sx0]",
'sm_ptr', 'sm_s1', "float *sm_ptr = &sm[blockIDX * sm_s0]",
'threadIdx.x', 'blockDim.x'), inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1',
"__syncthreads()", 'sm_ptr', 'sm_s1',
"}", 'threadIdx.x',
]) 'blockDim.x'),
"__syncthreads()", "}", ])
return ret1 + "\n" + ret2 return ret1 + "\n" + ret2
gpu_softmax = GpuSoftmax() gpu_softmax = GpuSoftmax()
...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -768,25 +769,20 @@ class GpuSoftmaxWithBias(GpuOp):
'const float * x', 'const int sx0', 'const int sx1', 'const float * x', 'const int sx0', 'const int sx1',
'const float * b', 'const int sb0', 'const float * b', 'const int sb0',
'float * sm', 'const int sm_s0', 'const int sm_s1'], 'float * sm', 'const int sm_s0', 'const int sm_s1'],
body=[ body=["extern __shared__ float buf[]",
"extern __shared__ float buf[]", "float * buf2 = buf + N",
"float * buf2 = buf + N", "for (int blockIDX = blockIdx.x; blockIDX < M;"
"for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){",
" blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = x[blockIDX * sx0 + tx * sx1]",
"buf[tx] = x[blockIDX * sx0 + tx * sx1]", "buf[tx] += b[tx * sb0]",
"buf[tx] += b[tx * sb0]", "buf2[tx] = buf[tx]", "}",
"buf2[tx] = buf[tx]", "__syncthreads()", inline_softmax('N', 'buf', 'buf2',
"}", 'threadIdx.x',
"__syncthreads()", 'blockDim.x'),
inline_softmax('N', 'buf', 'buf2', "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
'threadIdx.x', 'blockDim.x'), "sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]", "}",
"for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "__syncthreads()", "}", ])
"sm[blockIDX * sm_s0 + tx * sm_s1] = buf[tx]",
"}",
"__syncthreads()",
"}",
])
ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename, ret2 = nvcc_kernel("kSoftmaxWithBias_fixed_shared%s" % nodename,
params=['int M', 'int N', params=['int M', 'int N',
'const float * x', 'const float * x',
...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp): ...@@ -802,7 +798,8 @@ class GpuSoftmaxWithBias(GpuOp):
"float *sm_ptr = &sm[blockIDX * sm_s0]", "float *sm_ptr = &sm[blockIDX * sm_s0]",
inline_softmax_fixed_shared('N', 'buf', inline_softmax_fixed_shared('N', 'buf',
'x_ptr', 'sx1', 'x_ptr', 'sx1',
'sm_ptr', 'sm_s1', 'sm_ptr',
'sm_s1',
'threadIdx.x', 'threadIdx.x',
'blockDim.x', 'blockDim.x',
'b', 'sb0'), 'b', 'sb0'),
......
...@@ -4,7 +4,6 @@ import logging ...@@ -4,7 +4,6 @@ import logging
import os import os
import subprocess import subprocess
import sys import sys
import warnings
from locale import getpreferredencoding from locale import getpreferredencoding
import numpy import numpy
...@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler): ...@@ -249,8 +248,9 @@ class NVCC_compiler(Compiler):
_logger.debug('Writing module C++ code to %s', cppfilename) _logger.debug('Writing module C++ code to %s', cppfilename)
cppfile.write(src_code) cppfile.write(src_code)
lib_filename = os.path.join(location, '%s.%s' % lib_filename = os.path.join(
(module_name, get_lib_extension())) location, '%s.%s' %
(module_name, get_lib_extension()))
_logger.debug('Generating shared lib %s', lib_filename) _logger.debug('Generating shared lib %s', lib_filename)
# TODO: Why do these args cause failure on gtx285 that has 1.3 # TODO: Why do these args cause failure on gtx285 that has 1.3
...@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler): ...@@ -268,7 +268,7 @@ class NVCC_compiler(Compiler):
continue continue
for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I', for pattern in ['-O', '-arch=', '-ccbin=', '-G', '-g', '-I',
'-L', '--fmad', '--ftz', '--maxrregcount', '-L', '--fmad', '--ftz', '--maxrregcount',
'--prec-div', '--prec-sqrt', '--use_fast_math', '--prec-div', '--prec-sqrt', '--use_fast_math',
'-fmad', '-ftz', '-maxrregcount', '-fmad', '-ftz', '-maxrregcount',
'-prec-div', '-prec-sqrt', '-use_fast_math', '-prec-div', '-prec-sqrt', '-use_fast_math',
'--use-local-env', '--cl-version=']: '--use-local-env', '--cl-version=']:
...@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler): ...@@ -311,7 +311,7 @@ class NVCC_compiler(Compiler):
# https://wiki.debian.org/RpathIssue for details. # https://wiki.debian.org/RpathIssue for details.
if (not type(config.cuda).root.is_default and if (not type(config.cuda).root.is_default and
os.path.exists(os.path.join(config.cuda.root, 'lib'))): os.path.exists(os.path.join(config.cuda.root, 'lib'))):
rpaths.append(os.path.join(config.cuda.root, 'lib')) rpaths.append(os.path.join(config.cuda.root, 'lib'))
if sys.platform != 'darwin': if sys.platform != 'darwin':
...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler): ...@@ -341,7 +341,7 @@ class NVCC_compiler(Compiler):
indexof = cmd.index('-u') indexof = cmd.index('-u')
cmd.pop(indexof) # Remove -u cmd.pop(indexof) # Remove -u
cmd.pop(indexof) # Remove argument to -u cmd.pop(indexof) # Remove argument to -u
except ValueError as e: except ValueError:
done = True done = True
# CUDA Toolkit v4.1 Known Issues: # CUDA Toolkit v4.1 Known Issues:
...@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler): ...@@ -359,11 +359,13 @@ class NVCC_compiler(Compiler):
try: try:
os.chdir(location) os.chdir(location)
p = subprocess.Popen( p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2] nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
console_encoding = getpreferredencoding() console_encoding = getpreferredencoding()
nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding) nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding) nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
p = subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
finally: finally:
os.chdir(orig_dir) os.chdir(orig_dir)
......
差异被折叠。
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
import numpy import numpy
import theano.gof import theano.gof
from theano.compat import PY3 from theano.compat import PY3
...@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt) ...@@ -17,12 +7,21 @@ from theano.tensor import (get_vector_length, cast, opt)
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer, Variable from theano.gof import local_optimizer, Variable
__authors__ = "James Bergstra"
__copyright__ = "(c) 2011, University of Montreal"
__license__ = "3-clause BSD License"
__contact__ = "theano-dev@googlegroups.com"
"""
Define CURAND_RandomStreams - backed by CURAND.
"""
config = theano.config config = theano.config
class CURAND_Base(GpuOp): class CURAND_Base(GpuOp):
""" """
Base class for a random number generator implemented in CURAND. Base class for a random number generator implemented in CURAND.
The random number generator itself is an opaque reference managed by The random number generator itself is an opaque reference managed by
...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp): ...@@ -70,8 +69,7 @@ class CURAND_Base(GpuOp):
Return a tuple of attributes that define the Op. Return a tuple of attributes that define the Op.
""" """
return ( return (self.destructive,
self.destructive,
self.output_type, self.output_type,
self.seed, self.seed,
) )
...@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp): ...@@ -88,7 +86,7 @@ class CURAND_Base(GpuOp):
def make_node(self, generator, size): def make_node(self, generator, size):
return theano.gof.Apply(self, [generator, size], return theano.gof.Apply(self, [generator, size],
[generator.type(), self.output_type()]) [generator.type(), self.output_type()])
@classmethod @classmethod
def new_auto_update(cls, generator, ndim, dtype, size, seed): def new_auto_update(cls, generator, ndim, dtype, size, seed):
...@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp): ...@@ -101,10 +99,9 @@ class CURAND_Base(GpuOp):
v_size = theano.tensor.as_tensor_variable(size) v_size = theano.tensor.as_tensor_variable(size)
if ndim is None: if ndim is None:
ndim = get_vector_length(v_size) ndim = get_vector_length(v_size)
self = cls( self = cls(output_type=CudaNdarrayType((False,) * ndim),
output_type=CudaNdarrayType((False,) * ndim), seed=seed,
seed=seed, destructive=False)
destructive=False)
o_gen, sample = self(generator, cast(v_size, 'int32')) o_gen, sample = self(generator, cast(v_size, 'int32'))
...@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object): ...@@ -282,7 +279,7 @@ class CURAND_RandomStreams(object):
RandomStreams instance that creates CURAND-based random variables. RandomStreams instance that creates CURAND-based random variables.
One caveat is that generators are not serializable. One caveat is that generators are not serializable.
Parameters Parameters
---------- ----------
seed : int seed : int
...@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object): ...@@ -319,7 +316,7 @@ class CURAND_RandomStreams(object):
return rval return rval
def uniform(self, size, low=0.0, high=1.0, ndim=None, def uniform(self, size, low=0.0, high=1.0, ndim=None,
dtype=config.floatX): dtype=config.floatX):
""" """
Return symbolic tensor of uniform numbers. Return symbolic tensor of uniform numbers.
...@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object): ...@@ -327,14 +324,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable) assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg for i in size]), msg
else: else:
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size) s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size, u = CURAND_Uniform.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed()) self.next_seed())
self.state_updates.append(u.update) self.state_updates.append(u.update)
rval = u * (high - low) + low rval = u * (high - low) + low
if u.type.broadcastable != rval.type.broadcastable: if u.type.broadcastable != rval.type.broadcastable:
...@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object): ...@@ -342,10 +339,10 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of ' 'Increase the size to match the broadcasting pattern of '
'low and `high` arguments' 'low and `high` arguments'
) )
return rval return rval
def normal(self, size=None, avg=0.0, std=1.0, ndim=None, def normal(self, size=None, avg=0.0, std=1.0, ndim=None,
dtype=config.floatX): dtype=config.floatX):
""" """
Return symbolic tensor of normally-distributed numbers. Return symbolic tensor of normally-distributed numbers.
...@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object): ...@@ -359,14 +356,14 @@ class CURAND_RandomStreams(object):
if isinstance(size, tuple): if isinstance(size, tuple):
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert all([isinstance(i, int) or isinstance(i, Variable) assert all([isinstance(i, int) or isinstance(i, Variable)
for i in size]), msg for i in size]), msg
else: else:
msg = "size must be a tuple of int or a Theano variable" msg = "size must be a tuple of int or a Theano variable"
assert isinstance(size, Variable) and size.ndim == 1, msg assert isinstance(size, Variable) and size.ndim == 1, msg
generator = theano.shared(False) # makes a generic generator = theano.shared(False) # makes a generic
s_size = theano.tensor.as_tensor_variable(size) s_size = theano.tensor.as_tensor_variable(size)
u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
self.next_seed()) self.next_seed())
self.state_updates.append(u.update) self.state_updates.append(u.update)
rval = u * std + avg rval = u * std + avg
if u.type.broadcastable != rval.type.broadcastable: if u.type.broadcastable != rval.type.broadcastable:
...@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object): ...@@ -374,7 +371,7 @@ class CURAND_RandomStreams(object):
'Increase the size to match the broadcasting pattern of `low`' 'Increase the size to match the broadcasting pattern of `low`'
'and `high` arguments' 'and `high` arguments'
) )
return rval return rval
@local_optimizer([CURAND_Base]) @local_optimizer([CURAND_Base])
...@@ -386,5 +383,5 @@ def local_destructive(node): ...@@ -386,5 +383,5 @@ def local_destructive(node):
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('CURAND_destructive', optdb.register('CURAND_destructive',
opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run', opt.in2out(local_destructive, ignore_newtrees=True),
'inplace') 99, 'fast_run', 'inplace')
...@@ -6,7 +6,7 @@ import theano ...@@ -6,7 +6,7 @@ import theano
try: try:
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
except ImportError: except ImportError:
# To have the GPU back-end work without nose, we need this file to # To have the GPU back-end work without nose, we need this file to
...@@ -33,8 +33,9 @@ def test_nvidia_driver1(): ...@@ -33,8 +33,9 @@ def test_nvidia_driver1():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 2
if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1: if sum(isinstance(node.op, B.GpuCAReduce) for node in topo) != 1:
msg = '\n\t'.join(['Expected exactly one occurrence of GpuCAReduce ' + msg = '\n\t'.join(
'but got:']+[str(app) for app in topo]) ['Expected exactly one occurrence of GpuCAReduce ' +
'but got:'] + [str(app) for app in topo])
raise AssertionError(msg) raise AssertionError(msg)
if not numpy.allclose(f(), a.sum()): if not numpy.allclose(f(), a.sum()):
raise Exception("The nvidia driver version installed with this OS " raise Exception("The nvidia driver version installed with this OS "
......
...@@ -5,24 +5,22 @@ import itertools ...@@ -5,24 +5,22 @@ import itertools
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import numpy as np import numpy as np
from six.moves import xrange from six.moves import xrange
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available is False: if cuda_ndarray.cuda_available:
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
else:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.tests.test_extra_ops
from theano.sandbox.cuda.extra_ops import GpuCumsum
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else: else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu') mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')
from theano import tensor as T
import theano
from theano.tensor.extra_ops import cumsum, CumsumOp
from theano.tests import unittest_tools as utt
class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
mode = mode_with_gpu mode = mode_with_gpu
...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -129,11 +127,11 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
utt.assert_allclose(np.cumsum(a[:i]), f(a[:i])) utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a = np.random.random((block_max_size+2,)).astype("float32") a = np.random.random((block_max_size + 2,)).astype("float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
# Use recursive cumsum # Use recursive cumsum
a = np.ones((block_max_size*(block_max_size+1)+2,), a = np.ones((block_max_size * (block_max_size + 1) + 2,),
dtype="float32") dtype="float32")
utt.assert_allclose(np.cumsum(a), f(a)) utt.assert_allclose(np.cumsum(a), f(a))
...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -159,21 +157,22 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks # Use multiple GPU threadblocks
a_shape = [5, 5] a_shape = [5, 5]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks # Use multiple GPU gridblocks
a_shape = [4, 4] a_shape = [4, 4]
a_shape[1-shape_axis] = self.max_grid_size1+1 a_shape[1 - shape_axis] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5) utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)
# Use recursive cumsum # Use recursive cumsum
a_shape = [3, 3] a_shape = [3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype("float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum3D(self): def test_GpuCumsum3D(self):
...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp): ...@@ -198,32 +197,34 @@ class TestGpuCumsum(theano.tensor.tests.test_extra_ops.TestCumsumOp):
# Use multiple GPU threadblocks (along accumulation axis) # Use multiple GPU threadblocks (along accumulation axis)
a_shape = [2, 2, 2] a_shape = [2, 2, 2]
a_shape[shape_axis] = block_max_size+2 a_shape[shape_axis] = block_max_size + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use multiple GPU gridblocks (not along accumulation axis) # Use multiple GPU gridblocks (not along accumulation axis)
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+1) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
a_shape = [5, 5, 5] a_shape = [5, 5, 5]
a_shape[(shape_axis+2) % 3] = self.max_grid_size1+1 a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
if axis is None: if axis is None:
# Avoid floating point error # Avoid floating point error
a = np.sign(a-0.5).astype("float32") a = np.sign(a - 0.5).astype("float32")
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
# Use recursive cumsum (along accumulation axis) # Use recursive cumsum (along accumulation axis)
a_shape = [3, 3, 3] a_shape = [3, 3, 3]
a_shape[shape_axis] = block_max_size*(block_max_size+1)+2 a_shape[shape_axis] = block_max_size * (
block_max_size + 1) + 2
a = np.random.random(a_shape).astype("float32") a = np.random.random(a_shape).astype("float32")
a = np.sign(a-0.5).astype("float32") # Avoid floating point error a = np.sign(a - 0.5).astype(
"float32") # Avoid floating point error
utt.assert_allclose(np.cumsum(a, axis=axis), f(a)) utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_GpuCumsum4D(self): def test_GpuCumsum4D(self):
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
import unittest import unittest
import numpy import numpy
import copy
import theano import theano
from theano.tests import unittest_tools as utt from theano.tests import unittest_tools as utt
# Skip tests if cuda_ndarray is not available. # Skip tests if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
from theano.sandbox.cuda import float32_shared_constructor as shared from theano.sandbox.cuda import float32_shared_constructor as shared
from theano.sandbox.cuda.blas import ( from theano.sandbox.cuda.blas import (
GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs) GpuCorr3dMM, GpuCorr3dMM_gradWeights, GpuCorr3dMM_gradInputs)
from theano.sandbox.cuda.basic_ops import gpu_contiguous from theano.sandbox.cuda.basic_ops import gpu_contiguous
import theano.sandbox.cuda as cuda_ndarray
if not cuda_ndarray.cuda_available:
raise SkipTest('Optional package cuda not available')
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase): ...@@ -122,7 +121,9 @@ class TestCorr3DMM(unittest.TestCase):
inputs = shared(inputs_val) inputs = shared(inputs_val)
filters = shared(filters_val) filters = shared(filters_val)
bias = shared(numpy.zeros(filters_shape[4]).astype('float32')) bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias, d=subsample, conv = theano.tensor.nnet.convTransp3D(W=filters,
b=bias,
d=subsample,
H=inputs) H=inputs)
f_ref = theano.function([], conv) f_ref = theano.function([], conv)
res_ref = f_ref() res_ref = f_ref()
......
...@@ -8,7 +8,7 @@ from theano.sandbox import cuda ...@@ -8,7 +8,7 @@ from theano.sandbox import cuda
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
......
...@@ -11,7 +11,7 @@ from theano import ifelse ...@@ -11,7 +11,7 @@ from theano import ifelse
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
if cuda.cuda_available == False: if cuda.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0): ...@@ -39,7 +39,7 @@ def freemem(extra_alloc=0):
theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated() theano_alloc = cuda.cuda_ndarray.cuda_ndarray.theano_allocated()
return ("(n malloc/theano mem allocated in KB)", return ("(n malloc/theano mem allocated in KB)",
n_mallocs + extra_alloc, n_mallocs + extra_alloc,
int(theano_alloc / 1024) + extra_size) int(theano_alloc / 1024))
return ("n malloc on the gpu", n_mallocs + extra_alloc) return ("n malloc on the gpu", n_mallocs + extra_alloc)
# I don't use the following by default as if there is other stuff running # I don't use the following by default as if there is other stuff running
...@@ -83,9 +83,12 @@ def test_memory(): ...@@ -83,9 +83,12 @@ def test_memory():
variables = cuda.shared_constructor(np.ones((shapes[1],), variables = cuda.shared_constructor(np.ones((shapes[1],),
dtype='float32')) dtype='float32'))
derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables)) derp = tensor.sum(tensor.dot(some_matrix[:shapes[0]], variables))
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
borrow=True, np.prod(variables.get_value(
return_internal_type=True).shape) * 4 / 1024, "kB") borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
...@@ -112,7 +115,7 @@ def test_memory(): ...@@ -112,7 +115,7 @@ def test_memory():
del obj del obj
# print "After deleting function 1", freemem() # print "After deleting function 1", freemem()
#assert mem2 == freemem(), (mem2, freemem()) # assert mem2 == freemem(), (mem2, freemem())
del grad del grad
print("After deleting function 2", freemem()) print("After deleting function 2", freemem())
...@@ -155,16 +158,19 @@ def test_memory_lazy(): ...@@ -155,16 +158,19 @@ def test_memory_lazy():
derp = ifelse.IfElse(1)(branch_select, derp = ifelse.IfElse(1)(branch_select,
derp, some_matrix[:shapes[0]].sum()) derp, some_matrix[:shapes[0]].sum())
derp += 1 derp += 1
print("Shared took ", np.prod(variables.get_value( print("Shared took ",
borrow=True, np.prod(variables.get_value(
return_internal_type=True).shape) * 4 / 1024, "kB") borrow=True,
return_internal_type=True).shape) *
4 / 1024,
"kB")
mem2 = freemem() mem2 = freemem()
print("Before compilation", mem2) print("Before compilation", mem2)
mem2_1 = freemem(extra_alloc=more_alloc1) mem2_1 = freemem(extra_alloc=more_alloc1)
obj = theano.function([some_vector, branch_select], derp, obj = theano.function([some_vector, branch_select], derp,
mode=mode_with_gpu) mode=mode_with_gpu)
#theano.printing.debugprint(obj, print_type=True) # theano.printing.debugprint(obj, print_type=True)
mem3 = freemem() mem3 = freemem()
print("After function compilation 1", mem3) print("After function compilation 1", mem3)
assert mem2_1 == mem3, (mem2_1, mem3) assert mem2_1 == mem3, (mem2_1, mem3)
......
...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']: ...@@ -24,7 +24,7 @@ if theano.config.mode not in ['FAST_RUN', 'Mode', 'ProfileMode']:
'otherwise it is too slow!') 'otherwise it is too slow!')
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
if tcn.cuda_available == False: if tcn.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
...@@ -68,7 +68,7 @@ def print_mode(mode): ...@@ -68,7 +68,7 @@ def print_mode(mode):
def print_diff_mode(a, b): def print_diff_mode(a, b):
if (a is not None and if (a is not None and
isinstance(a, (theano.compile.ProfileMode,)) and isinstance(a, (theano.compile.ProfileMode,)) and
isinstance(b, (theano.compile.ProfileMode,))): isinstance(b, (theano.compile.ProfileMode,))):
a.print_diff_summary(b) a.print_diff_summary(b)
...@@ -138,8 +138,8 @@ def test_run_nnet(): ...@@ -138,8 +138,8 @@ def test_run_nnet():
# print "cpu:", rval_cpu # print "cpu:", rval_cpu
# print "gpu:", rval_gpu # print "gpu:", rval_gpu
abs_diff, rel_diff = \ abs_diff, rel_diff = \
theano.gradient.numeric_grad.abs_rel_err(rval_gpu, theano.gradient.numeric_grad.abs_rel_err(rval_gpu,
rval_cpu) rval_cpu)
max_abs_diff = abs_diff.max() max_abs_diff = abs_diff.max()
# print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % ( # print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d" % (
# max_abs_diff, rel_diff.max(), n_in, n_hid) # max_abs_diff, rel_diff.max(), n_in, n_hid)
...@@ -147,19 +147,20 @@ def test_run_nnet(): ...@@ -147,19 +147,20 @@ def test_run_nnet():
rtol = 1e-4 rtol = 1e-4
if n_in * n_hid >= 2048 * 4096: if n_in * n_hid >= 2048 * 4096:
rtol = 7e-4 rtol = 7e-4
assert numpy.allclose(rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \ assert numpy.allclose(
("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff, rval_cpu, rval_gpu, rtol=rtol, atol=1e-6), \
rel_diff.max(), n_in, n_hid) ("max_abs_diff, max_rel_diff, n_in, n_hid", max_abs_diff,
rel_diff.max(), n_in, n_hid)
def test_run_nnet_med(): def test_run_nnet_med():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 128, 50, 4, n_train=10000) run_nnet(False, 10, 128, 50, 4, n_train=10000)
def test_run_nnet_small(): def test_run_nnet_small():
utt.seed_rng() utt.seed_rng()
rval_cpu = run_nnet(False, 10, 10, 4, 4, n_train=100000) run_nnet(False, 10, 10, 4, 4, n_train=100000)
def run_conv_nnet1(use_gpu): def run_conv_nnet1(use_gpu):
...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu): ...@@ -203,8 +204,11 @@ def run_conv_nnet1(use_gpu):
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -279,7 +283,9 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1)
conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2, conv_op1 = conv.ConvOp((n_kern, logical_hid_shape[0] // 2,
logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1) logical_hid_shape[1] // 2),
shape_kern1[2:],
n_kern1, n_batch, 1, 1)
hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))) hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))
hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle(( hid1 = tensor.tanh(conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((
...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST ...@@ -295,8 +301,11 @@ def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST
mode = get_mode(use_gpu) mode = get_mode(use_gpu)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
# for i, n in enumerate(train.maker.fgraph.toposort()): # for i, n in enumerate(train.maker.fgraph.toposort()):
# print i, n # print i, n
...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -376,13 +385,14 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
if downsample_ops: if downsample_ops:
hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))) hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))))
else: else:
hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x') hid = tensor.tanh(
))[:, :, ::2, ::2]) (conv_op(x, w0) + b0.dimshuffle(
(0, 'x', 'x')))[:, :, ::2, ::2])
hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x'))) hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x')))
hid_flat = hid1.reshape((n_batch, n_hid)) hid_flat = hid1.reshape((n_batch, n_hid))
out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c) out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c)
loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(out, loss = tensor.sum(tensor.nnet.crossentropy_categorical_1hot(
tensor.argmax(y, axis=1)) * lr) out, tensor.argmax(y, axis=1)) * lr)
# print 'loss type', loss.type # print 'loss type', loss.type
params = [w0, b0, w1, b1, v, c] params = [w0, b0, w1, b1, v, c]
...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, ...@@ -391,8 +401,11 @@ def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch,
mode = get_mode(use_gpu, check_isfinite) mode = get_mode(use_gpu, check_isfinite)
# print 'building pfunc ...' # print 'building pfunc ...'
train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, train = pfunc(
g in zip(params, gparams)]) [x, y, lr],
[loss],
mode=mode,
updates=[(p, p - g) for p, g in zip(params, gparams)])
if verbose: if verbose:
theano.printing.debugprint(train) theano.printing.debugprint(train)
...@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize, ...@@ -422,13 +435,13 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train, params, x_shape, y_shape, mode = build_conv_nnet2_classif( train, params, x_shape, y_shape, mode = build_conv_nnet2_classif(
use_gpu=use_gpu, use_gpu=use_gpu,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
if use_gpu: if use_gpu:
device = 'GPU' device = 'GPU'
...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize, ...@@ -440,10 +453,8 @@ def run_conv_nnet2_classif(use_gpu, seed, isize, ksize, bsize,
lr = theano._asarray(0.01, dtype='float32') lr = theano._asarray(0.01, dtype='float32')
rvals = my_zeros(n_train) rvals = my_zeros(n_train)
t0 = time.time()
for i in xrange(n_train): for i in xrange(n_train):
rvals[i] = train(xval, yval, lr)[0] rvals[i] = train(xval, yval, lr)[0]
t1 = time.time()
print_mode(mode) print_mode(mode)
if pickle and isinstance(mode, theano.compile.ProfileMode): if pickle and isinstance(mode, theano.compile.ProfileMode):
...@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -495,35 +506,36 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
compare = True compare = True
if not compare: if not compare:
return run_conv_nnet2_classif(use_gpu=use_gpu, return run_conv_nnet2_classif(
seed=seed, isize=isize, ksize=ksize, bsize=bsize, use_gpu=use_gpu,
n_train=n_train, seed=seed, isize=isize, ksize=ksize, bsize=bsize,
check_isfinite=check_isfinite, n_train=n_train,
pickle=pickle, check_isfinite=check_isfinite,
verbose=verbose, pickle=pickle,
version=version) verbose=verbose,
version=version)
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \ train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
build_conv_nnet2_classif( build_conv_nnet2_classif(
use_gpu=False, use_gpu=False,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
utt.seed_rng(seed) # Seeds numpy.random with seed utt.seed_rng(seed) # Seeds numpy.random with seed
train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \ train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
build_conv_nnet2_classif( build_conv_nnet2_classif(
use_gpu=True, use_gpu=True,
isize=isize, isize=isize,
ksize=ksize, ksize=ksize,
n_batch=bsize, n_batch=bsize,
verbose=verbose, verbose=verbose,
version=version, version=version,
check_isfinite=check_isfinite) check_isfinite=check_isfinite)
assert x_shape == x_shape_gpu assert x_shape == x_shape_gpu
assert y_shape == y_shape_gpu assert y_shape == y_shape_gpu
...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize, ...@@ -570,18 +582,6 @@ def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
finally: finally:
theano.tensor.basic.float32_atol = orig_float32_atol theano.tensor.basic.float32_atol = orig_float32_atol
if pickle:
if isinstance(cpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN CPU profile mode dump")
print(pickle.dumps(cpu_mode))
print("END CPU profile mode dump")
if isinstance(gpu_mode, theano.compile.ProfileMode):
import pickle
print("BEGIN GPU profile mode dump")
print(pickle.dumps(gpu_mode))
print("END GPU profile mode dump")
# print "CPU time: %.3f, GPU time: %.3f, speed up %f" % ( # print "CPU time: %.3f, GPU time: %.3f, speed up %f" % (
# (time_cpu, time_gpu, time_cpu/time_gpu)) # (time_cpu, time_gpu, time_cpu/time_gpu))
# print "Estimated time for one pass through MNIST with CPU: %f" % ( # print "Estimated time for one pass through MNIST with CPU: %f" % (
......
# Skip test if cuda_ndarray is not available. # Skip test if cuda_ndarray is not available.
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from nose.plugins.skip import SkipTest from nose.plugins.skip import SkipTest
import unittest
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
import theano.sandbox.cuda as cuda_ndarray import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False: if cuda_ndarray.cuda_available is False:
raise SkipTest('Optional package cuda disabled') raise SkipTest('Optional package cuda disabled')
import theano.tensor.nnet.tests.test_neighbours
from theano.sandbox.cuda.neighbours import GpuImages2Neibs
if theano.config.mode == 'FAST_COMPILE': if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu') mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
......
差异被折叠。
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论