提交 1414b2d2 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #6264 from notoraptor/remove-config-gpuarray-sync

Remove config.gpuarray.sync from Theano code.
...@@ -193,11 +193,14 @@ AddConfigVar( ...@@ -193,11 +193,14 @@ AddConfigVar(
in_c_key=False) in_c_key=False)
def deprecated_gpuarray_sync(val):
    """Filter for the deprecated ``gpuarray.sync`` configuration flag.

    The only accepted setting is "disabled"; any attempt to enable the
    flag raises an error so that users remove it from their configuration.

    Parameters
    ----------
    val
        The value being assigned to the flag.  Either a boolean-like
        object or the textual spelling of one (``'False'``, ``'false'``
        and ``'0'`` are treated as "disabled").

    Returns
    -------
    bool
        Always ``False``.

    Raises
    ------
    RuntimeError
        If `val` requests that the flag be enabled.
    """
    # A non-empty string such as 'False' is truthy, so a bare truth test
    # would reject an explicit request to keep the flag disabled.
    # NOTE(review): presumably values can arrive as text when supplied
    # through a flags string or config file -- confirm against the caller.
    if val in ('False', 'false', '0'):
        return False
    if val:
        raise RuntimeError("Flag gpuarray.sync is deprecated and will be removed in next Theano release.")
    return False
AddConfigVar('gpuarray.sync', AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before """This flag is deprecated and will be removed in next Theano release.""",
returning. Setting this to True will slow down execution, ConfigParam(False, allow_override=False, filter=deprecated_gpuarray_sync),
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True) in_c_key=True)
AddConfigVar('gpuarray.preallocate', AddConfigVar('gpuarray.preallocate',
......
...@@ -849,8 +849,6 @@ class GpuAlloc(HideC, Alloc): ...@@ -849,8 +849,6 @@ class GpuAlloc(HideC, Alloc):
out[0][...] = v out[0][...] = v
else: else:
out[0][...] = v out[0][...] = v
if config.gpuarray.sync:
out[0].sync()
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
vv = inp[0] vv = inp[0]
...@@ -915,13 +913,10 @@ class GpuAlloc(HideC, Alloc): ...@@ -915,13 +913,10 @@ class GpuAlloc(HideC, Alloc):
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['params'], """ % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['params'],
fail=sub['fail'], memset_0=memset_0) fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
def do_constant_folding(self, node): def do_constant_folding(self, node):
from . import subtensor, blas from . import subtensor, blas
...@@ -1382,7 +1377,7 @@ class GpuSplit(HideC, Split): ...@@ -1382,7 +1377,7 @@ class GpuSplit(HideC, Split):
# we reuse the perform of the CPU op, which is suitable # we reuse the perform of the CPU op, which is suitable
def c_code_cache_version(self): def c_code_cache_version(self):
return (1,) return (2,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>'] return ['<numpy_compat.h>', '<gpuarray_helper.h>']
...@@ -1514,13 +1509,6 @@ class GpuSplit(HideC, Split): ...@@ -1514,13 +1509,6 @@ class GpuSplit(HideC, Split):
free(split_points); free(split_points);
""" """
if config.gpuarray.sync:
main_code += """
for (i = 0; i < splits_count; ++i) {
GpuArray_sync(&((*outputs[i])->ga));
}
"""
return main_code % locals() return main_code % locals()
...@@ -1649,7 +1637,6 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, ...@@ -1649,7 +1637,6 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype) typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
s = """ s = """
size_t dims[2] = {0, 0}; size_t dims[2] = {0, 0};
...@@ -1689,11 +1676,9 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off, ...@@ -1689,11 +1676,9 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
} }
} }
if(%(sync)d)
GpuArray_sync(&%(z)s->ga);
""" % locals() """ % locals()
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
...@@ -3,7 +3,7 @@ import os.path ...@@ -3,7 +3,7 @@ import os.path
from six import integer_types from six import integer_types
import theano import theano
from theano import Apply, config, Op from theano import Apply, Op
from theano.compile import optdb from theano.compile import optdb
from theano.gof import LocalOptGroup, ParamsType from theano.gof import LocalOptGroup, ParamsType
...@@ -133,14 +133,11 @@ class GpuGemv(BlasOp): ...@@ -133,14 +133,11 @@ class GpuGemv(BlasOp):
%(fail)s %(fail)s
} }
""" % vars """ % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
gpugemv_no_inplace = GpuGemv(inplace=False) gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True) gpugemv_inplace = GpuGemv(inplace=True)
...@@ -222,14 +219,11 @@ class GpuGemm(BlasOp): ...@@ -222,14 +219,11 @@ class GpuGemm(BlasOp):
%(fail)s %(fail)s
} }
""" % vars """ % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (6,) return (7,)
gpugemm_no_inplace = GpuGemm(inplace=False) gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True) gpugemm_inplace = GpuGemm(inplace=True)
...@@ -293,14 +287,11 @@ class GpuGer(BlasOp): ...@@ -293,14 +287,11 @@ class GpuGer(BlasOp):
%(fail)s %(fail)s
} }
""" % vars """ % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
gpuger_no_inplace = GpuGer(inplace=False) gpuger_no_inplace = GpuGer(inplace=False)
...@@ -361,14 +352,11 @@ class GpuDot22(BlasOp): ...@@ -361,14 +352,11 @@ class GpuDot22(BlasOp):
%(fail)s %(fail)s
} }
""" % vars """ % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (4,) return (5,)
gpu_dot22 = GpuDot22() gpu_dot22 = GpuDot22()
...@@ -444,14 +432,11 @@ class GpuGemmBatch(BlasOp): ...@@ -444,14 +432,11 @@ class GpuGemmBatch(BlasOp):
%(fail)s; %(fail)s;
} }
""" % vars """ % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code return code
def c_code_cache_version(self): def c_code_cache_version(self):
return (3,) return (4,)
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False) gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True) gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
...@@ -549,7 +534,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -549,7 +534,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file). # Raise this whenever modifying the C code (including the file).
return (9,) return (10,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None): def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
""" """
...@@ -633,16 +618,7 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -633,16 +618,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)): if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'") raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '-1' width = '-1'
sync = ""
if config.gpuarray.sync:
sync = """
int err = GpuArray_sync(&%(out)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM error: gpuarray sync failed.");
%(fail)s;
}
""" % locals()
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -831,8 +807,6 @@ class BaseGpuCorrMM(CGpuKernelBase): ...@@ -831,8 +807,6 @@ class BaseGpuCorrMM(CGpuKernelBase):
} }
assert (out2 == %(out)s); assert (out2 == %(out)s);
%(sync)s
""" % sub """ % sub
...@@ -1161,7 +1135,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1161,7 +1135,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
def c_code_cache_version(self): def c_code_cache_version(self):
# raise this whenever modifying the code below. # raise this whenever modifying the code below.
return (7,) return (8,)
def c_code_helper(self, bottom, weights, top, direction, sub, def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None): height=None, width=None, depth=None):
...@@ -1258,16 +1232,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1258,16 +1232,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)): if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'") raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
depth = '-1' depth = '-1'
sync = ""
if config.gpuarray.sync:
sync = """
int err = GpuArray_sync(&%(out)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorr3dMM error: gpuarray sync failed.");
%(fail)s;
}
""" % locals()
sub = sub.copy() sub = sub.copy()
sub.update(locals()) sub.update(locals())
...@@ -1490,8 +1455,6 @@ class BaseGpuCorr3dMM(CGpuKernelBase): ...@@ -1490,8 +1455,6 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
} }
assert (out2 == %(out)s); assert (out2 == %(out)s);
%(sync)s
""" % sub """ % sub
......
...@@ -3,7 +3,7 @@ import copy ...@@ -3,7 +3,7 @@ import copy
import numpy as np import numpy as np
import theano import theano
from theano import Apply, scalar, config, Op from theano import Apply, scalar, Op
from six.moves import StringIO, xrange from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite from theano.scalar import Scalar, Composite
...@@ -371,18 +371,6 @@ class GpuElemwise(HideC, Elemwise): ...@@ -371,18 +371,6 @@ class GpuElemwise(HideC, Elemwise):
} }
""" % dict(fail=sub['fail']) """ % dict(fail=sub['fail'])
if config.gpuarray.sync:
z = outputs[0]
code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %%s.",
GpuArray_error(&%(z)s->ga, err));
%(fail)s;
}
""" % locals()
return str(code) return str(code)
# To disable the superclass perform. # To disable the superclass perform.
...@@ -395,7 +383,7 @@ class GpuElemwise(HideC, Elemwise): ...@@ -395,7 +383,7 @@ class GpuElemwise(HideC, Elemwise):
def c_code_cache_version(self): def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version() ver = self.scalar_op.c_code_cache_version()
if ver: if ver:
return (8, ver) return (9, ver)
else: else:
return ver return ver
...@@ -843,15 +831,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -843,15 +831,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(err_check)s %(err_check)s
""" % locals(), file=sio) """ % locals(), file=sio)
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
%(sync)s
""" % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
def _k_decl(self, node, nodename, pattern=None, def _k_decl(self, node, nodename, pattern=None,
...@@ -1128,12 +1107,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1128,12 +1107,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
if(PyGpuArray_SIZE(%(x)s)==0){ if(PyGpuArray_SIZE(%(x)s)==0){
...@@ -1155,7 +1129,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1155,7 +1129,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads; size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params); int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1272,12 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1272,12 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
int verbose = 0; int verbose = 0;
...@@ -1325,7 +1293,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1325,7 +1293,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
}else{ }else{
GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s; GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1}; size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
...@@ -1354,7 +1321,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1354,7 +1321,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params); int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1375,12 +1341,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1375,12 +1341,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print(""" print("""
{ {
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2]; //int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
...@@ -1429,7 +1389,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1429,7 +1389,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} }
else else
{ {
...@@ -1470,7 +1429,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1470,7 +1429,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
); );
%(makecall)s %(makecall)s
} }
%(sync)s
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1506,12 +1464,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1506,12 +1464,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0 # use threadIdx.x for i0
# use blockIdx.x for i1 # use blockIdx.x for i1
# use blockIdx.y for i2 # use blockIdx.y for i2
...@@ -1562,7 +1514,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1562,7 +1514,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1}; (void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} }
} }
""" % locals(), file=sio) """ % locals(), file=sio)
...@@ -1749,7 +1700,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -1749,7 +1700,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio) """ % locals(), file=sio)
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
version = [20] # the version corresponding to the c code in this Op version = [21] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend... # now we insert versions for the ops on which we depend...
scalar_node = Apply( scalar_node = Apply(
...@@ -2581,10 +2532,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2581,10 +2532,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s %(fail)s
} }
if (%(sync)d) """ % dict(out=out[0], inp=inp[0], fail=sub['fail'])
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
k = self.get_kernel_cache(node) k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size, _, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim) node.inputs[0].ndim)
...@@ -2719,23 +2667,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2719,23 +2667,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp; %(output)s = tmp;
} }
if (%(sync)d) { """ % dict(k_var='k_reduk_' + name,
err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
ls=ls, fail=sub['fail'], output=output, input=input, ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype)) cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code return code
def c_code_cache_version_apply(self, node): def c_code_cache_version_apply(self, node):
return (2, self.kernel_version(node)) return (3, self.kernel_version(node))
def generate_kernel(self, node, odtype, redux): def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add): if isinstance(self.scalar_op, scalar.basic.Add):
......
...@@ -10,7 +10,7 @@ except ImportError: ...@@ -10,7 +10,7 @@ except ImportError:
import theano import theano
import theano.sandbox.multinomial import theano.sandbox.multinomial
from theano import Apply, config from theano import Apply
from theano.gof import Op from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
...@@ -137,7 +137,6 @@ KERNEL void k_multi_warp_multinomial( ...@@ -137,7 +137,6 @@ KERNEL void k_multi_warp_multinomial(
out, = outputs out, = outputs
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
...@@ -212,15 +211,14 @@ KERNEL void k_multi_warp_multinomial( ...@@ -212,15 +211,14 @@ KERNEL void k_multi_warp_multinomial(
GpuKernel_error(&%(kname)s, err)); GpuKernel_error(&%(kname)s, err));
%(fail)s; %(fail)s;
} }
if(%(sync)d)
GpuArray_sync(&(out->ga));
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (6,)
class GPUAChoiceFromUniform(GpuKernelBase, Op): class GPUAChoiceFromUniform(GpuKernelBase, Op):
...@@ -360,7 +358,6 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -360,7 +358,6 @@ KERNEL void k_multi_warp_multinomial_wor(
replace = int(self.replace) replace = int(self.replace)
fail = sub['fail'] fail = sub['fail']
ctx = sub['params'] ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar kname = self.gpu_kernels(node, name)[0].objvar
s = """ s = """
PyGpuArrayObject * pvals = %(pvals)s; PyGpuArrayObject * pvals = %(pvals)s;
...@@ -447,15 +444,14 @@ KERNEL void k_multi_warp_multinomial_wor( ...@@ -447,15 +444,14 @@ KERNEL void k_multi_warp_multinomial_wor(
Py_DECREF(pvals_copy); Py_DECREF(pvals_copy);
%(fail)s; %(fail)s;
} }
if(%(sync)d)
GpuArray_sync(&(out->ga));
Py_DECREF(pvals_copy); Py_DECREF(pvals_copy);
} // END NESTED SCOPE } // END NESTED SCOPE
""" % locals() """ % locals()
return s return s
def c_code_cache_version(self): def c_code_cache_version(self):
return (9,) return (10,)
@register_opt('fast_compile') @register_opt('fast_compile')
......
from __future__ import absolute_import, print_function, division from __future__ import absolute_import, print_function, division
from theano import Op, Apply, config from theano import Op, Apply
from theano.gof import ParamsType from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T import theano.tensor as T
...@@ -45,7 +45,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -45,7 +45,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()]) context_name=ten4.type.context_name)()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) return (14,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -284,12 +284,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -284,12 +284,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s; %(fail)s;
} }
""" % dict(fail=sub['fail']) """ % dict(fail=sub['fail'])
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability: # NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize # For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node) # For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
...@@ -563,13 +558,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -563,13 +558,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(void *)&%(z)s->ga.offset}; (void *)&%(z)s->ga.offset};
err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params); err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} // END NESTED SCOPE } // END NESTED SCOPE
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0], """ % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype, dtype_neib_shape=node.inputs[1].dtype,
dtype_neib_step=node.inputs[2].dtype, dtype_neib_step=node.inputs[2].dtype,
err_check=err_check, err_check=err_check,
sync=sync,
name=name, name=name,
params=sub['params'], params=sub['params'],
fail=sub['fail']) fail=sub['fail'])
......
...@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division ...@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
import os import os
import numpy as np import numpy as np
from theano import Op, Apply, config from theano import Op, Apply
from six import StringIO from six import StringIO
try: try:
...@@ -187,12 +187,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -187,12 +187,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO() sio = StringIO()
print(""" print("""
if (PyGpuArray_DIMS(%(x)s)[0] != if (PyGpuArray_DIMS(%(x)s)[0] !=
...@@ -235,13 +229,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op): ...@@ -235,13 +229,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
%(am)s->ga.data, %(am)s->ga.offset, %(am)s->ga.data, %(am)s->ga.offset,
PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s); PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
%(err_check)s %(err_check)s
%(sync)s
} }
""" % locals(), file=sio) """ % locals(), file=sio)
return sio.getvalue() return sio.getvalue()
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) return (14,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias() gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
...@@ -267,7 +260,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -267,7 +260,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()]) return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self): def c_code_cache_version(self):
return (13,) return (14,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -296,12 +289,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -296,12 +289,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """ return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ? const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
...@@ -378,7 +365,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op): ...@@ -378,7 +365,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
(void *)&stride_DX0, (void *)&stride_DX1}; (void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params); int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s %(err_check)s
%(sync)s
} }
assert(%(dx)s); assert(%(dx)s);
""" % locals() """ % locals()
...@@ -465,7 +451,7 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -465,7 +451,7 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape return shape
def c_code_cache_version(self): def c_code_cache_version(self):
return (16,) return (17,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -487,15 +473,6 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -487,15 +473,6 @@ class GpuSoftmax(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else:
sync = ""
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -555,7 +532,6 @@ class GpuSoftmax(GpuKernelBase, Op): ...@@ -555,7 +532,6 @@ class GpuSoftmax(GpuKernelBase, Op):
msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err); msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
} }
%(err_check)s %(err_check)s
%(sync)s
} }
} }
assert(%(z)s); assert(%(z)s);
...@@ -751,7 +727,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -751,7 +727,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
return [shape[0]] return [shape[0]]
def c_code_cache_version(self): def c_code_cache_version(self):
return (15,) return (16,)
def c_headers(self): def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>'] return ['<numpy_compat.h>', '<gpuarray/types.h>']
...@@ -775,13 +751,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -775,13 +751,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
%(fail)s; %(fail)s;
} }
""" % locals() """ % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
return """ return """
if (PyGpuArray_NDIM(%(x)s) != 2) if (PyGpuArray_NDIM(%(x)s) != 2)
{ {
...@@ -856,7 +825,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op): ...@@ -856,7 +825,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err); msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
} }
%(err_check)s %(err_check)s
%(sync)s
} }
} }
assert(%(z)s); assert(%(z)s);
......
from __future__ import print_function, absolute_import, division from __future__ import print_function, absolute_import, division
import os import os
import theano
from theano.gof import Op, Apply from theano.gof import Op, Apply
from theano.gof.type import Generic from theano.gof.type import Generic
...@@ -125,11 +124,6 @@ class GpuMaxAndArgmax(Op): ...@@ -125,11 +124,6 @@ class GpuMaxAndArgmax(Op):
%(fail)s %(fail)s
} }
""" """
if theano.config.gpuarray.sync:
ret += """
GpuArray_sync(&%(max)s->ga);
GpuArray_sync(&%(argmax)s->ga);
"""
return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1], return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode, 'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
'name': name, 'fail': sub['fail']} 'name': name, 'fail': sub['fail']}
...@@ -141,4 +135,4 @@ class GpuMaxAndArgmax(Op): ...@@ -141,4 +135,4 @@ class GpuMaxAndArgmax(Op):
""" % {'name': name, 'X': inputs[0]} """ % {'name': name, 'X': inputs[0]}
def c_code_cache_version(self): def c_code_cache_version(self):
return (1, 1) return (2,)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论