提交 1414b2d2 authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #6264 from notoraptor/remove-config-gpuarray-sync

Remove config.gpuarray.sync from Theano code.
......@@ -193,11 +193,14 @@ AddConfigVar(
in_c_key=False)
def deprecated_gpuarray_sync(val):
    """Config filter for the removed ``gpuarray.sync`` flag.

    The flag no longer has any effect; this filter exists only so that
    setting it to a truthy value fails loudly instead of being silently
    ignored.

    Parameters
    ----------
    val : bool
        Value the user tried to assign to ``config.gpuarray.sync``.

    Returns
    -------
    bool
        Always ``False`` (the only accepted value).

    Raises
    ------
    RuntimeError
        If ``val`` is truthy, since the flag is deprecated.
    """
    # NOTE: the paste/diff rendering had stripped the indentation of this
    # function; the logic itself is unchanged, including the exact message.
    if val:
        raise RuntimeError("Flag gpuarray.sync is deprecated and will be removed in next Theano release.")
    return False
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
"""This flag is deprecated and will be removed in next Theano release.""",
ConfigParam(False, allow_override=False, filter=deprecated_gpuarray_sync),
in_c_key=True)
AddConfigVar('gpuarray.preallocate',
......
......@@ -849,8 +849,6 @@ class GpuAlloc(HideC, Alloc):
out[0][...] = v
else:
out[0][...] = v
if config.gpuarray.sync:
out[0].sync()
def c_code(self, node, name, inp, out, sub):
vv = inp[0]
......@@ -915,13 +913,10 @@ class GpuAlloc(HideC, Alloc):
""" % dict(name=name, ndim=ndim, zz=zz, vv=vv, ctx=sub['params'],
fail=sub['fail'], memset_0=memset_0)
if config.gpuarray.sync:
code += "GpuArray_sync(&%(zz)s->ga);" % dict(zz=zz)
return code
def c_code_cache_version(self):
return (3,)
return (4,)
def do_constant_folding(self, node):
from . import subtensor, blas
......@@ -1382,7 +1377,7 @@ class GpuSplit(HideC, Split):
# we reuse the perform of the CPU op, which is suitable
def c_code_cache_version(self):
return (1,)
return (2,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray_helper.h>']
......@@ -1514,13 +1509,6 @@ class GpuSplit(HideC, Split):
free(split_points);
"""
if config.gpuarray.sync:
main_code += """
for (i = 0; i < splits_count; ++i) {
GpuArray_sync(&((*outputs[i])->ga));
}
"""
return main_code % locals()
......@@ -1649,7 +1637,6 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
fail = sub['fail']
ctx = sub['params']
typecode = pygpu.gpuarray.dtype_to_typecode(self.dtype)
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
size_t dims[2] = {0, 0};
......@@ -1689,11 +1676,9 @@ KERNEL void eye(GLOBAL_MEM %(ctype)s *a, ga_size a_off,
}
}
if(%(sync)d)
GpuArray_sync(&%(z)s->ga);
""" % locals()
return s
def c_code_cache_version(self):
return (9,)
return (10,)
......@@ -3,7 +3,7 @@ import os.path
from six import integer_types
import theano
from theano import Apply, config, Op
from theano import Apply, Op
from theano.compile import optdb
from theano.gof import LocalOptGroup, ParamsType
......@@ -133,14 +133,11 @@ class GpuGemv(BlasOp):
%(fail)s
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (9,)
return (10,)
gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
......@@ -222,14 +219,11 @@ class GpuGemm(BlasOp):
%(fail)s
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (6,)
return (7,)
gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
......@@ -293,14 +287,11 @@ class GpuGer(BlasOp):
%(fail)s
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (4,)
return (5,)
gpuger_no_inplace = GpuGer(inplace=False)
......@@ -361,14 +352,11 @@ class GpuDot22(BlasOp):
%(fail)s
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (4,)
return (5,)
gpu_dot22 = GpuDot22()
......@@ -444,14 +432,11 @@ class GpuGemmBatch(BlasOp):
%(fail)s;
}
""" % vars
if config.gpuarray.sync:
code += """
GpuArray_sync(&%(out)s->ga);
""" % vars
return code
def c_code_cache_version(self):
return (3,)
return (4,)
gpugemmbatch_no_inplace = GpuGemmBatch(inplace=False)
gpugemmbatch_inplace = GpuGemmBatch(inplace=True)
......@@ -549,7 +534,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
def c_code_cache_version(self):
# Raise this whenever modifying the C code (including the file).
return (9,)
return (10,)
def c_code_helper(self, bottom, weights, top, direction, sub, height=None, width=None):
"""
......@@ -633,16 +618,7 @@ class BaseGpuCorrMM(CGpuKernelBase):
if ((direction != 0) and (dW != 1)) or ((direction == 1) and (padW == -1)):
raise ValueError("width must be given for backprop with horizontal sampling or pad='half'")
width = '-1'
sync = ""
if config.gpuarray.sync:
sync = """
int err = GpuArray_sync(&%(out)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorrMM error: gpuarray sync failed.");
%(fail)s;
}
""" % locals()
sub = sub.copy()
sub.update(locals())
......@@ -831,8 +807,6 @@ class BaseGpuCorrMM(CGpuKernelBase):
}
assert (out2 == %(out)s);
%(sync)s
""" % sub
......@@ -1161,7 +1135,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
def c_code_cache_version(self):
# raise this whenever modifying the code below.
return (7,)
return (8,)
def c_code_helper(self, bottom, weights, top, direction, sub,
height=None, width=None, depth=None):
......@@ -1258,16 +1232,7 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
if ((direction != 0) and (dD != 1)) or ((direction == 1) and (padD == -1)):
raise ValueError("depth must be given for backprop with horizontal sampling or pad='half'")
depth = '-1'
sync = ""
if config.gpuarray.sync:
sync = """
int err = GpuArray_sync(&%(out)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"BaseGpuCorr3dMM error: gpuarray sync failed.");
%(fail)s;
}
""" % locals()
sub = sub.copy()
sub.update(locals())
......@@ -1490,8 +1455,6 @@ class BaseGpuCorr3dMM(CGpuKernelBase):
}
assert (out2 == %(out)s);
%(sync)s
""" % sub
......
......@@ -3,7 +3,7 @@ import copy
import numpy as np
import theano
from theano import Apply, scalar, config, Op
from theano import Apply, scalar, Op
from six.moves import StringIO, xrange
from theano.gof.utils import MethodNotDefined
from theano.scalar import Scalar, Composite
......@@ -371,18 +371,6 @@ class GpuElemwise(HideC, Elemwise):
}
""" % dict(fail=sub['fail'])
if config.gpuarray.sync:
z = outputs[0]
code += """
err = GpuArray_sync(&%(z)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: %%s.",
GpuArray_error(&%(z)s->ga, err));
%(fail)s;
}
""" % locals()
return str(code)
# To disable the superclass perform.
......@@ -395,7 +383,7 @@ class GpuElemwise(HideC, Elemwise):
def c_code_cache_version(self):
ver = self.scalar_op.c_code_cache_version()
if ver:
return (8, ver)
return (9, ver)
else:
return ver
......@@ -843,15 +831,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(err_check)s
""" % locals(), file=sio)
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
%(sync)s
""" % locals(), file=sio)
return sio.getvalue()
def _k_decl(self, node, nodename, pattern=None,
......@@ -1128,12 +1107,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
if(PyGpuArray_SIZE(%(x)s)==0){
......@@ -1155,7 +1129,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
size_t n_shared = sizeof(%(acc_dtype)s) * n_threads;
int err = GpuKernel_call(&%(k_var)s, 1, &n_blocks, &n_threads, n_shared, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1272,12 +1245,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
int verbose = 0;
......@@ -1325,7 +1293,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s
%(sync)s
}else{
GpuKernel *%(k_var)s = &kernel_reduce_010_%(name)s;
size_t n_threads[3] = {std::min(PyGpuArray_DIMS(%(x)s)[0], (size_t) 256), 1, 1};
......@@ -1354,7 +1321,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(%(k_var)s, 3, n_blocks, n_threads, n_shared, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1375,12 +1341,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
print("""
{
//int n_summations = PyGpuArray_DIMS(%(x)s)[0] * PyGpuArray_DIMS(%(x)s)[2];
......@@ -1429,7 +1389,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s
%(sync)s
}
else
{
......@@ -1470,7 +1429,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
);
%(makecall)s
}
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1506,12 +1464,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
# use threadIdx.x for i0
# use blockIdx.x for i1
# use blockIdx.y for i2
......@@ -1562,7 +1514,6 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
(void *)&stride_Z0, (void *)&stride_Z1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, n_threads, 0, kernel_params);
%(err_check)s
%(sync)s
}
}
""" % locals(), file=sio)
......@@ -1749,7 +1700,7 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
""" % locals(), file=sio)
def c_code_cache_version_apply(self, node):
version = [20] # the version corresponding to the c code in this Op
version = [21] # the version corresponding to the c code in this Op
# now we insert versions for the ops on which we depend...
scalar_node = Apply(
......@@ -2581,10 +2532,7 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(fail)s
}
if (%(sync)d)
GpuArray_sync(&%(out)s->ga);
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'],
sync=bool(config.gpuarray.sync))
""" % dict(out=out[0], inp=inp[0], fail=sub['fail'])
k = self.get_kernel_cache(node)
_, src, _, ls = k._get_basic_kernel(k.init_local_size,
node.inputs[0].ndim)
......@@ -2719,23 +2667,14 @@ class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
%(output)s = tmp;
}
if (%(sync)d) {
err = GpuArray_sync(&%(output)s->ga);
if (err != GA_NO_ERROR) {
PyErr_Format(PyExc_RuntimeError,
"gpuarray error: GpuCAReduceCPY: %%s.",
GpuKernel_error(&%(k_var)s, err));
%(fail)s
}
}
""" % dict(k_var='k_reduk_' + name, sync=bool(config.gpuarray.sync),
""" % dict(k_var='k_reduk_' + name,
ls=ls, fail=sub['fail'], output=output, input=input,
cast_out=bool(acc_dtype != node.outputs[0].type.dtype))
return code
def c_code_cache_version_apply(self, node):
return (2, self.kernel_version(node))
return (3, self.kernel_version(node))
def generate_kernel(self, node, odtype, redux):
if isinstance(self.scalar_op, scalar.basic.Add):
......
......@@ -10,7 +10,7 @@ except ImportError:
import theano
import theano.sandbox.multinomial
from theano import Apply, config
from theano import Apply
from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value
......@@ -137,7 +137,6 @@ KERNEL void k_multi_warp_multinomial(
out, = outputs
fail = sub['fail']
ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
out_typecode = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype)
in_typecode = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
......@@ -212,15 +211,14 @@ KERNEL void k_multi_warp_multinomial(
GpuKernel_error(&%(kname)s, err));
%(fail)s;
}
if(%(sync)d)
GpuArray_sync(&(out->ga));
} // END NESTED SCOPE
""" % locals()
return s
def c_code_cache_version(self):
return (5,)
return (6,)
class GPUAChoiceFromUniform(GpuKernelBase, Op):
......@@ -360,7 +358,6 @@ KERNEL void k_multi_warp_multinomial_wor(
replace = int(self.replace)
fail = sub['fail']
ctx = sub['params']
sync = bool(config.gpuarray.sync)
kname = self.gpu_kernels(node, name)[0].objvar
s = """
PyGpuArrayObject * pvals = %(pvals)s;
......@@ -447,15 +444,14 @@ KERNEL void k_multi_warp_multinomial_wor(
Py_DECREF(pvals_copy);
%(fail)s;
}
if(%(sync)d)
GpuArray_sync(&(out->ga));
Py_DECREF(pvals_copy);
} // END NESTED SCOPE
""" % locals()
return s
def c_code_cache_version(self):
return (9,)
return (10,)
@register_opt('fast_compile')
......
from __future__ import absolute_import, print_function, division
from theano import Op, Apply, config
from theano import Op, Apply
from theano.gof import ParamsType
from theano.tensor.nnet.neighbours import Images2Neibs
import theano.tensor as T
......@@ -45,7 +45,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
context_name=ten4.type.context_name)()])
def c_code_cache_version(self):
return (13,)
return (14,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -284,12 +284,7 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
%(fail)s;
}
""" % dict(fail=sub['fail'])
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % dict(z=out[0], err_check=err_check)
# NB: To reduce C code variability:
# For itemsize_ten4, I use GpuArray_ITEMSIZE(&ten4->ga) instead of np.dtype(node.inputs[0].dtype).itemsize
# For itemsize_z, I use itemsize_ten4, as ten4 and z have same type properties (deduced from make_node)
......@@ -563,13 +558,11 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
(void *)&%(z)s->ga.offset};
err = GpuKernel_call(fptr, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s
%(sync)s
} // END NESTED SCOPE
""" % dict(ten4=inp[0], neib_shape=inp[1], neib_step=inp[2], z=out[0],
dtype_neib_shape=node.inputs[1].dtype,
dtype_neib_step=node.inputs[2].dtype,
err_check=err_check,
sync=sync,
name=name,
params=sub['params'],
fail=sub['fail'])
......
......@@ -2,7 +2,7 @@ from __future__ import absolute_import, print_function, division
import os
import numpy as np
from theano import Op, Apply, config
from theano import Op, Apply
from six import StringIO
try:
......@@ -187,12 +187,6 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
sio = StringIO()
print("""
if (PyGpuArray_DIMS(%(x)s)[0] !=
......@@ -235,13 +229,12 @@ class GpuCrossentropySoftmaxArgmax1HotWithBias(GpuKernelBase, Op):
%(am)s->ga.data, %(am)s->ga.offset,
PyGpuArray_STRIDE(%(am)s, 0) / %(itemsize_am)s);
%(err_check)s
%(sync)s
}
""" % locals(), file=sio)
return sio.getvalue()
def c_code_cache_version(self):
return (13,)
return (14,)
gpu_crossentropy_softmax_argmax_1hot_with_bias = GpuCrossentropySoftmaxArgmax1HotWithBias()
......@@ -267,7 +260,7 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
return Apply(self, [dnll, sm, y_idx], [sm.type()])
def c_code_cache_version(self):
return (13,)
return (14,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -296,12 +289,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
%(err_check)s
""" % locals()
return """
// Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar.
const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ?
......@@ -378,7 +365,6 @@ class GpuCrossentropySoftmax1HotWithBiasDx(GpuKernelBase, Op):
(void *)&stride_DX0, (void *)&stride_DX1};
int err = GpuKernel_call(&%(k_var)s, 3, n_blocks, threads_per_block, 0, kernel_params);
%(err_check)s
%(sync)s
}
assert(%(dx)s);
""" % locals()
......@@ -465,7 +451,7 @@ class GpuSoftmax(GpuKernelBase, Op):
return shape
def c_code_cache_version(self):
return (16,)
return (17,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -487,15 +473,6 @@ class GpuSoftmax(GpuKernelBase, Op):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
else:
sync = ""
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -555,7 +532,6 @@ class GpuSoftmax(GpuKernelBase, Op):
msg = GpuKernel_error(&kSoftmax_fixed_shared%(nodename)s, err);
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
......@@ -751,7 +727,7 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
return [shape[0]]
def c_code_cache_version(self):
return (15,)
return (16,)
def c_headers(self):
return ['<numpy_compat.h>', '<gpuarray/types.h>']
......@@ -775,13 +751,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
%(fail)s;
}
""" % locals()
sync = ""
if config.gpuarray.sync:
sync = """
err = GpuArray_sync(&%(z)s->ga);
msg = "sync error";
%(err_check)s
""" % locals()
return """
if (PyGpuArray_NDIM(%(x)s) != 2)
{
......@@ -856,7 +825,6 @@ class GpuSoftmaxWithBias(GpuKernelBase, Op):
msg = GpuKernel_error(&kSoftmaxWithBias_fixed_shared%(nodename)s, err);
}
%(err_check)s
%(sync)s
}
}
assert(%(z)s);
......
from __future__ import print_function, absolute_import, division
import os
import theano
from theano.gof import Op, Apply
from theano.gof.type import Generic
......@@ -125,11 +124,6 @@ class GpuMaxAndArgmax(Op):
%(fail)s
}
"""
if theano.config.gpuarray.sync:
ret += """
GpuArray_sync(&%(max)s->ga);
GpuArray_sync(&%(argmax)s->ga);
"""
return ret % {'X': input_names[0], 'axes': sub['params'], 'max': output_names[0], 'argmax': output_names[1],
'max_typecode': max_typecode, 'argmax_typecode': argmax_typecode,
'name': name, 'fail': sub['fail']}
......@@ -141,4 +135,4 @@ class GpuMaxAndArgmax(Op):
""" % {'name': name, 'X': inputs[0]}
def c_code_cache_version(self):
return (1, 1)
return (2,)
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论