提交 0c53fb52 作者: Frédéric Bastien 提交者: GitHub

Merge pull request #5579 from ReyhaneAskari/CleanUp

Clean up
...@@ -73,7 +73,7 @@ def as_gpuarray_variable(x, context_name): ...@@ -73,7 +73,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return gpu_from_host(context_name)(x) return GpuFromHost(context_name)(x)
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
...@@ -617,7 +617,7 @@ class HostFromGpu(Op): ...@@ -617,7 +617,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [gpu_from_host(inputs[0].type.context_name)(gz)] return [GpuFromHost(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -663,8 +663,8 @@ class GpuFromHost(Op): ...@@ -663,8 +663,8 @@ class GpuFromHost(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [host_from_gpu(as_gpuarray_variable( return [as_gpuarray_variable(
gz, context_name=self.context_name))] gz, context_name=self.context_name).transfer('cpu')]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -722,14 +722,6 @@ class GpuFromHost(Op): ...@@ -722,14 +722,6 @@ class GpuFromHost(Op):
return (9,) return (9,)
# Caching GpuFromHost
def gpu_from_host(ctx):
    """
    Return the shared ``GpuFromHost`` op for the context name ``ctx``.

    One op instance per ``ctx`` is kept in ``gpu_from_host.cache``; the
    op is built on the first request and the stored instance is returned
    on later calls with the same ``ctx``.
    """
    # Consult this function's own cache.  The earlier check looked in
    # ``gpu_alloc.cache``, a different dictionary, so a context name
    # would not normally be found there and a fresh op was built (and
    # the stored one overwritten) on every call.
    if ctx not in gpu_from_host.cache:
        gpu_from_host.cache[ctx] = GpuFromHost(ctx)
    return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op): class GpuToGpu(Op):
""" """
Transfer data between GPUs. Transfer data between GPUs.
...@@ -953,15 +945,6 @@ class GpuAlloc(HideC, Alloc): ...@@ -953,15 +945,6 @@ class GpuAlloc(HideC, Alloc):
return True return True
# Memoised constructor for GpuAlloc
def gpu_alloc(ctx, memset_0=False):
    """
    Return the cached ``GpuAlloc`` op for the pair ``(ctx, memset_0)``.

    The op is created the first time a given pair is requested and is
    stored in ``gpu_alloc.cache`` for reuse afterwards.
    """
    registry = gpu_alloc.cache
    cache_key = (ctx, memset_0)
    if cache_key in registry:
        return registry[cache_key]
    alloc_op = GpuAlloc(ctx, memset_0)
    registry[cache_key] = alloc_op
    return alloc_op
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, AllocEmpty): class GpuAllocEmpty(HideC, AllocEmpty):
""" """
Allocate uninitialized memory on the GPU. Allocate uninitialized memory on the GPU.
...@@ -1048,14 +1031,6 @@ def empty_like(var): ...@@ -1048,14 +1031,6 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
    """
    Return the cached ``GpuAllocEmpty`` op for ``dtype`` and ``ctx``.

    Ops are stored in ``gpu_alloc_empty.cache`` under the key
    ``(dtype, ctx)`` and built only when the key is not yet present.
    """
    known_ops = gpu_alloc_empty.cache
    lookup = (dtype, ctx)
    if lookup in known_ops:
        return known_ops[lookup]
    # The op constructor takes the dtype first, then the context name.
    empty_op = GpuAllocEmpty(dtype, ctx)
    known_ops[lookup] = empty_op
    return empty_op
gpu_alloc_empty.cache = {}
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Return a C contiguous version of the input. Return a C contiguous version of the input.
...@@ -1132,7 +1107,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -1132,7 +1107,7 @@ class GpuReshape(HideC, tensor.Reshape):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name) x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp) shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = x.transfer('cpu').reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable, broadcastable=res.broadcastable,
context_name=ctx_name) context_name=ctx_name)
......
...@@ -32,7 +32,7 @@ from . import pygpu ...@@ -32,7 +32,7 @@ from . import pygpu
from .type import (get_context, gpu_context_type, list_contexts, from .type import (get_context, gpu_context_type, list_contexts,
GpuArraySharedVariable) GpuArraySharedVariable)
from .basic_ops import (as_gpuarray_variable, infer_context_name, from .basic_ops import (as_gpuarray_variable, infer_context_name,
gpu_contiguous, gpu_alloc_empty, gpu_contiguous, GpuAllocEmpty,
empty_like, GpuArrayType, HostFromGpu) empty_like, GpuArrayType, HostFromGpu)
from .elemwise import GpuElemwise from .elemwise import GpuElemwise
...@@ -466,18 +466,6 @@ class GpuDnnConvDesc(COp): ...@@ -466,18 +466,6 @@ class GpuDnnConvDesc(COp):
return (super(GpuDnnConvDesc, self).c_code_cache_version(), version()) return (super(GpuDnnConvDesc, self).c_code_cache_version(), version())
def gpu_dnn_conv_desc(border_mode, subsample=(1, 1), conv_mode='conv',
                      precision="float32"):
    """
    Return the cached ``GpuDnnConvDesc`` op for the given settings.

    The four arguments, in signature order, form the key under which
    the op is stored in ``gpu_dnn_conv_desc.cache``; the op is only
    constructed when that key has not been seen before.
    """
    settings = (border_mode, subsample, conv_mode, precision)
    known_descs = gpu_dnn_conv_desc.cache
    if settings in known_descs:
        return known_descs[settings]
    # Same four positional arguments, in the same order, as the key.
    desc_op = GpuDnnConvDesc(*settings)
    known_descs[settings] = desc_op
    return desc_op
gpu_dnn_conv_desc.cache = {}
# scalar constants # scalar constants
_zero = constant(np.asarray(0.0, dtype='float64')) _zero = constant(np.asarray(0.0, dtype='float64'))
_one = constant(np.asarray(1.0, dtype='float64')) _one = constant(np.asarray(1.0, dtype='float64'))
...@@ -613,8 +601,8 @@ class GpuDnnConv(DnnBase): ...@@ -613,8 +601,8 @@ class GpuDnnConv(DnnBase):
top = gpu_contiguous(top) top = gpu_contiguous(top)
d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc) d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc) d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -651,14 +639,6 @@ class GpuDnnConv(DnnBase): ...@@ -651,14 +639,6 @@ class GpuDnnConv(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv(algo=None, inplace=False):
    """
    Return the cached ``GpuDnnConv`` op for ``(algo, inplace)``.

    Instances live in ``gpu_dnn_conv.cache``; a new one is built only
    for a pair that has not been requested yet.
    """
    store = gpu_dnn_conv.cache
    pair = (algo, inplace)
    if pair in store:
        return store[pair]
    conv_op = GpuDnnConv(algo, inplace)
    store[pair] = conv_op
    return conv_op
gpu_dnn_conv.cache = {}
class GpuDnnConvGradW(DnnBase): class GpuDnnConvGradW(DnnBase):
""" """
...@@ -703,8 +683,8 @@ class GpuDnnConvGradW(DnnBase): ...@@ -703,8 +683,8 @@ class GpuDnnConvGradW(DnnBase):
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
d_img = gpu_dnn_conv_gradI()(kerns, top, empty_like(img), desc) d_img = GpuDnnConvGradI()(kerns, top, empty_like(img), desc)
d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc) d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -790,14 +770,6 @@ class GpuDnnConvGradW(DnnBase): ...@@ -790,14 +770,6 @@ class GpuDnnConvGradW(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradW(algo=None, inplace=False):
    """
    Return the cached ``GpuDnnConvGradW`` op for ``(algo, inplace)``.

    Instances are kept in ``gpu_dnn_conv_gradW.cache`` and created on
    first use of a given pair.
    """
    store = gpu_dnn_conv_gradW.cache
    pair = (algo, inplace)
    if pair in store:
        return store[pair]
    # NOTE(review): the op is built with positional (inplace, algo), the
    # reverse of the key order -- presumably this matches the
    # constructor's parameter order; confirm against the class.
    gradw_op = GpuDnnConvGradW(inplace, algo)
    store[pair] = gradw_op
    return gradw_op
gpu_dnn_conv_gradW.cache = {}
class GpuDnnConvGradI(DnnBase): class GpuDnnConvGradI(DnnBase):
""" """
The convolution gradient with respect to the inputs. The convolution gradient with respect to the inputs.
...@@ -843,8 +815,8 @@ class GpuDnnConvGradI(DnnBase): ...@@ -843,8 +815,8 @@ class GpuDnnConvGradI(DnnBase):
img = gpu_contiguous(img) img = gpu_contiguous(img)
d_kerns = gpu_dnn_conv_gradW()(img, top, empty_like(kerns), desc) d_kerns = GpuDnnConvGradW()(img, top, empty_like(kerns), desc)
d_top = gpu_dnn_conv()(img, kerns, empty_like(top), desc) d_top = GpuDnnConv()(img, kerns, empty_like(top), desc)
d_alpha = grad_not_implemented(self, 4, alpha) d_alpha = grad_not_implemented(self, 4, alpha)
d_beta = grad_not_implemented(self, 5, beta) d_beta = grad_not_implemented(self, 5, beta)
...@@ -920,14 +892,6 @@ class GpuDnnConvGradI(DnnBase): ...@@ -920,14 +892,6 @@ class GpuDnnConvGradI(DnnBase):
return [shape[2]] return [shape[2]]
def gpu_dnn_conv_gradI(algo=None, inplace=False):
    """
    Return the cached ``GpuDnnConvGradI`` op for ``(algo, inplace)``.

    Instances are kept in ``gpu_dnn_conv_gradI.cache`` and created on
    first use of a given pair.
    """
    store = gpu_dnn_conv_gradI.cache
    pair = (algo, inplace)
    if pair in store:
        return store[pair]
    # NOTE(review): the op is built with positional (inplace, algo), the
    # reverse of the key order -- presumably this matches the
    # constructor's parameter order; confirm against the class.
    gradi_op = GpuDnnConvGradI(inplace, algo)
    store[pair] = gradi_op
    return gradi_op
gpu_dnn_conv_gradI.cache = {}
def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
conv_mode='conv', direction_hint=None, workmem=None, conv_mode='conv', direction_hint=None, workmem=None,
algo=None, precision=None): algo=None, precision=None):
...@@ -1002,10 +966,10 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1002,10 +966,10 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1, shape_i(img, 2, fgraph) - shape_i(kerns, 2, fgraph) + 1,
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1) shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc) conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1) and elif (border_mode == 'full' and subsample == (1, 1) and
...@@ -1021,17 +985,17 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1021,17 +985,17 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1, shape_i(img, 2, fgraph) + shape_i(kerns, 2, fgraph) - 1,
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1) shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc) return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy # contig_version will return a gpu_contiguous copy
# if the img contains negative strides # if the img contains negative strides
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
# We can use Shape_i and bypass the infer_shape here as this is on # We can use Shape_i and bypass the infer_shape here as this is on
...@@ -1042,8 +1006,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1), ...@@ -1042,8 +1006,8 @@ def dnn_conv(img, kerns, border_mode='valid', subsample=(1, 1),
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
...@@ -1114,10 +1078,10 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1114,10 +1078,10 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1, shape_i(img, 3, fgraph) - shape_i(kerns, 3, fgraph) + 1,
shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1) shape_i(img, 4, fgraph) - shape_i(kerns, 4, fgraph) + 1)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode='cross', precision=precision)(out.shape) conv_mode='cross', precision=precision)(out.shape)
conv = gpu_dnn_conv_gradW()(img, kerns, out, desc) conv = GpuDnnConvGradW()(img, kerns, out, desc)
return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name) return as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3, 4), ctx_name)
elif (border_mode == 'full' and subsample == (1, 1, 1) and elif (border_mode == 'full' and subsample == (1, 1, 1) and
...@@ -1134,17 +1098,17 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1134,17 +1098,17 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1, shape_i(img, 3, fgraph) + shape_i(kerns, 3, fgraph) - 1,
shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1) shape_i(img, 4, fgraph) + shape_i(kerns, 4, fgraph) - 1)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1), desc = GpuDnnConvDesc(border_mode='valid', subsample=(1, 1, 1),
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
return gpu_dnn_conv_gradI()(kerns, img, out, desc) return GpuDnnConvGradI()(kerns, img, out, desc)
# Standard case: We use GpuDnnConv with suitable padding. # Standard case: We use GpuDnnConv with suitable padding.
# contig_version will return a gpu_contiguous copy # contig_version will return a gpu_contiguous copy
# if the img contains negative strides # if the img contains negative strides
img = gpu_contiguous(img) img = gpu_contiguous(img)
kerns = gpu_contiguous(kerns) kerns = gpu_contiguous(kerns)
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)(kerns.shape) conv_mode=conv_mode, precision=precision)(kerns.shape)
desc_op = desc.owner.op desc_op = desc.owner.op
# We can use Shape_i and bypass the infer_shape here as this is on # We can use Shape_i and bypass the infer_shape here as this is on
...@@ -1155,8 +1119,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1), ...@@ -1155,8 +1119,8 @@ def dnn_conv3d(img, kerns, border_mode='valid', subsample=(1, 1, 1),
desc_op.border_mode, desc_op.border_mode,
desc_op.subsample) desc_op.subsample)
out_shp = assert_conv_shape(out_shp) out_shp = assert_conv_shape(out_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*out_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*out_shp)
return gpu_dnn_conv(algo=algo)(img, kerns, out, desc) return GpuDnnConv(algo=algo)(img, kerns, out, desc)
def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
...@@ -1172,11 +1136,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid', ...@@ -1172,11 +1136,10 @@ def dnn_gradweight(img, topgrad, kerns_shp, border_mode='valid',
kerns_shp = as_tensor_variable(kerns_shp) kerns_shp = as_tensor_variable(kerns_shp)
precision = get_precision(precision, [img, topgrad]) precision = get_precision(precision, [img, topgrad])
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)( conv_mode=conv_mode, precision=precision)(kerns_shp)
kerns_shp) out = GpuAllocEmpty(dtype=img.dtype, context_name=ctx_name)(*kerns_shp)
out = gpu_alloc_empty(ctx_name, dtype=img.dtype)(*kerns_shp) return GpuDnnConvGradW()(img, topgrad, out, desc)
return gpu_dnn_conv_gradW()(img, topgrad, out, desc)
def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid', def dnn_gradweight3d(img, topgrad, kerns_shp, border_mode='valid',
...@@ -1201,11 +1164,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid', ...@@ -1201,11 +1164,10 @@ def dnn_gradinput(kerns, topgrad, img_shp, border_mode='valid',
img_shp = as_tensor_variable(img_shp) img_shp = as_tensor_variable(img_shp)
precision = get_precision(precision, [kerns, topgrad]) precision = get_precision(precision, [kerns, topgrad])
desc = gpu_dnn_conv_desc(border_mode=border_mode, subsample=subsample, desc = GpuDnnConvDesc(border_mode=border_mode, subsample=subsample,
conv_mode=conv_mode, precision=precision)( conv_mode=conv_mode, precision=precision)(kerns.shape)
kerns.shape) out = GpuAllocEmpty(dtype=kerns.dtype, context_name=ctx_name)(*img_shp)
out = gpu_alloc_empty(ctx_name, kerns.dtype)(*img_shp) return GpuDnnConvGradI()(kerns, topgrad, out, desc)
return gpu_dnn_conv_gradI()(kerns, topgrad, out, desc)
def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid', def dnn_gradinput3d(kerns, topgrad, img_shp, border_mode='valid',
...@@ -2849,17 +2811,17 @@ def local_abstractconv_gi_cudnn(node): ...@@ -2849,17 +2811,17 @@ def local_abstractconv_gi_cudnn(node):
@inplace_allocempty(GpuDnnConv, 2) @inplace_allocempty(GpuDnnConv, 2)
def local_dnn_conv_inplace(node, inputs): def local_dnn_conv_inplace(node, inputs):
return [gpu_dnn_conv(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradW, 2) @inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs): def local_dnn_convgw_inplace(node, inputs):
return [gpu_dnn_conv_gradW(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo, inplace=True)(*inputs)]
@inplace_allocempty(GpuDnnConvGradI, 2) @inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs): def local_dnn_convgi_inplace(node, inputs):
return [gpu_dnn_conv_gradI(algo=node.op.algo, inplace=True)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo, inplace=True)(*inputs)]
optdb.register('local_dnna_conv_inplace', optdb.register('local_dnna_conv_inplace',
tensor.opt.in2out(local_dnn_conv_inplace, tensor.opt.in2out(local_dnn_conv_inplace,
...@@ -2872,40 +2834,40 @@ optdb.register('local_dnna_conv_inplace', ...@@ -2872,40 +2834,40 @@ optdb.register('local_dnna_conv_inplace',
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs): def local_dnn_conv_alpha_merge(node, *inputs):
return [gpu_dnn_conv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs): def local_dnn_convw_alpha_merge(node, *inputs):
return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5) @alpha_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5)
def local_dnn_convi_alpha_merge(node, *inputs): def local_dnn_convi_alpha_merge(node, *inputs):
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConv, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_conv_output_merge(node, *inputs): def local_dnn_conv_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [gpu_dnn_conv(algo=node.op.algo)(*inputs)] return [GpuDnnConv(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convw_output_merge(node, *inputs): def local_dnn_convw_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [gpu_dnn_conv_gradW(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradW(algo=node.op.algo)(*inputs)]
@register_opt('cudnn') @register_opt('cudnn')
@output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2) @output_merge(GpuDnnConvGradI, alpha_in=4, beta_in=5, out_in=2)
def local_dnn_convi_output_merge(node, *inputs): def local_dnn_convi_output_merge(node, *inputs):
inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:] inputs = inputs[0:2] + (gpu_contiguous(inputs[2]),) + inputs[3:]
return [gpu_dnn_conv_gradI(algo=node.op.algo)(*inputs)] return [GpuDnnConvGradI(algo=node.op.algo)(*inputs)]
def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs): def local_gpua_pool_dnn_alternative(op, ctx_name, inputs, outputs):
......
...@@ -2,13 +2,13 @@ from __future__ import absolute_import, print_function, division ...@@ -2,13 +2,13 @@ from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumOp from theano.tensor.extra_ops import CumOp
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape) from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape, infer_context_name)
from .opt import register_opt, op_lifter, register_opt2 from .opt import register_opt, op_lifter, register_opt2
......
...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant ...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty, from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name, gpu_alloc_empty) infer_context_name)
from .type import gpu_context_type from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
...@@ -158,7 +158,7 @@ def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs): ...@@ -158,7 +158,7 @@ def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = getattr(outputs[0], 'fgraph', None) fgraph = getattr(outputs[0], 'fgraph', None)
C = gpu_alloc_empty(ctx_name, dtype='float16')( C = GpuAllocEmpty('float16', ctx_name)(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
...@@ -44,8 +44,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name, ...@@ -44,8 +44,7 @@ from .basic_ops import (as_gpuarray_variable, infer_context_name,
HostFromGpu, GpuFromHost, HostFromGpu, GpuFromHost,
GpuSplit, GpuContiguous, gpu_contiguous, GpuSplit, GpuContiguous, gpu_contiguous,
GpuAlloc, GpuAllocEmpty, GpuReshape, GpuAlloc, GpuAllocEmpty, GpuReshape,
GpuEye, gpu_join, GpuJoin, gpu_alloc_empty, GpuEye, gpu_join, GpuJoin)
gpu_alloc, gpu_from_host)
from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch, from .blas import (gpu_dot22, GpuGemm, GpuGer, GpuGemmBatch,
gpugemm_no_inplace, gpugemm_inplace, gpugemm_no_inplace, gpugemm_inplace,
gpugemmbatch_no_inplace, gpugemmbatch_no_inplace,
...@@ -61,9 +60,8 @@ from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter, ...@@ -61,9 +60,8 @@ from .blocksparse import (GpuSparseBlockGemv, GpuSparseBlockOuter,
from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx, from .nnet import (gpu_crossentropy_softmax_1hot_with_bias_dx,
gpu_crossentropy_softmax_argmax_1hot_with_bias, gpu_crossentropy_softmax_argmax_1hot_with_bias,
gpu_softmax_with_bias, gpu_softmax) gpu_softmax_with_bias, gpu_softmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
GpuCAReduceCPY, gpu_ca_reduce_cuda, gpu_erfinv, gpu_erfcinv, GpuCAReduceCPY, gpu_erfinv, gpu_erfcinv,
max_inputs_to_GpuElemwise) max_inputs_to_GpuElemwise)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, from .subtensor import (GpuIncSubtensor, GpuSubtensor,
GpuAdvancedSubtensor, GpuAdvancedSubtensor,
...@@ -165,14 +163,14 @@ gpu_optimizer.register('local_remove_all_assert', ...@@ -165,14 +163,14 @@ gpu_optimizer.register('local_remove_all_assert',
def safe_to_gpu(x, ctx_name): def safe_to_gpu(x, ctx_name):
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return gpu_from_host(ctx_name)(x) return GpuFromHost(ctx_name)(x)
else: else:
return x return x
def safe_to_cpu(x): def safe_to_cpu(x):
if isinstance(x.type, GpuArrayType): if isinstance(x.type, GpuArrayType):
return host_from_gpu(x) return x.transfer('cpu')
else: else:
return x return x
...@@ -236,7 +234,7 @@ def op_lifter(OP, cuda_only=False): ...@@ -236,7 +234,7 @@ def op_lifter(OP, cuda_only=False):
elif isinstance(new_op, (tuple, list)): elif isinstance(new_op, (tuple, list)):
return [safe_to_cpu(o) for o in new_op] return [safe_to_cpu(o) for o in new_op]
else: # suppose it is a variable on the GPU else: # suppose it is a variable on the GPU
return [host_from_gpu(new_op)] return [new_op.transfer('cpu')]
return False return False
local_opt.__name__ = maker.__name__ local_opt.__name__ = maker.__name__
return local_optimizer(OP)(local_opt) return local_optimizer(OP)(local_opt)
...@@ -269,7 +267,7 @@ class InputToGpuOptimizer(Optimizer): ...@@ -269,7 +267,7 @@ class InputToGpuOptimizer(Optimizer):
continue continue
try: try:
new_input = host_from_gpu(gpu_from_host(target)(input)) new_input = GpuFromHost(target)(input).transfer('cpu')
fgraph.replace_validate(input, new_input, fgraph.replace_validate(input, new_input,
"InputToGpuOptimizer") "InputToGpuOptimizer")
except TypeError: except TypeError:
...@@ -546,7 +544,7 @@ def local_cut_gpu_transfers(node): ...@@ -546,7 +544,7 @@ def local_cut_gpu_transfers(node):
# gpub -> # gpub ->
if isinstance(n2.op, GpuToGpu): if isinstance(n2.op, GpuToGpu):
return [host_from_gpu(n2.inputs[0])] return [n2.inputs[0].transfer('cpu')]
# ? -> gpua -> gpub # ? -> gpua -> gpub
elif isinstance(node.op, GpuToGpu): elif isinstance(node.op, GpuToGpu):
...@@ -600,14 +598,14 @@ def local_gpua_alloc2(node): ...@@ -600,14 +598,14 @@ def local_gpua_alloc2(node):
i.owner.op in [host_from_gpu, tensor.alloc] i.owner.op in [host_from_gpu, tensor.alloc]
for i in c.inputs[1:]) for i in c.inputs[1:])
for c, idx in node.outputs[0].clients)): for c, idx in node.outputs[0].clients)):
return [host_from_gpu(gpu_alloc(None)(*node.inputs))] return [GpuAlloc(None)(*node.inputs).transfer('cpu')]
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([tensor.Alloc]) @op_lifter([tensor.Alloc])
@register_opt2([tensor.Alloc], 'fast_compile') @register_opt2([tensor.Alloc], 'fast_compile')
def local_gpua_alloc(op, context_name, inputs, outputs): def local_gpuaalloc(op, context_name, inputs, outputs):
return gpu_alloc(context_name) return GpuAlloc(context_name)(*inputs)
@register_opt('fast_compile') @register_opt('fast_compile')
...@@ -616,7 +614,7 @@ def local_gpua_alloc(op, context_name, inputs, outputs): ...@@ -616,7 +614,7 @@ def local_gpua_alloc(op, context_name, inputs, outputs):
def local_gpua_alloc_empty(op, context_name, inputs, outputs): def local_gpua_alloc_empty(op, context_name, inputs, outputs):
# We use _props_dict() to make sure that the GPU op know all the # We use _props_dict() to make sure that the GPU op know all the
# CPU op props. # CPU op props.
return gpu_alloc_empty(context_name, **op._props_dict()) return GpuAllocEmpty(context_name=context_name, **op._props_dict())(*inputs)
@register_opt() @register_opt()
...@@ -627,7 +625,7 @@ def local_gpualloc_memset_0(node): ...@@ -627,7 +625,7 @@ def local_gpualloc_memset_0(node):
if (isinstance(inp, GpuArrayConstant) and if (isinstance(inp, GpuArrayConstant) and
inp.data.size == 1 and inp.data.size == 1 and
(np.asarray(inp.data) == 0).all()): (np.asarray(inp.data) == 0).all()):
new_op = gpu_alloc(node.op.context_name, memset_0=True) new_op = GpuAlloc(node.op.context_name, memset_0=True)
return [new_op(*node.inputs)] return [new_op(*node.inputs)]
...@@ -637,7 +635,7 @@ def local_gpua_alloc_empty_to_zeros(node): ...@@ -637,7 +635,7 @@ def local_gpua_alloc_empty_to_zeros(node):
if isinstance(node.op, GpuAllocEmpty): if isinstance(node.op, GpuAllocEmpty):
context_name = infer_context_name(*node.inputs) context_name = infer_context_name(*node.inputs)
z = np.asarray(0, dtype=node.outputs[0].dtype) z = np.asarray(0, dtype=node.outputs[0].dtype)
return [gpu_alloc(context_name)(as_gpuarray_variable(z, context_name), return [GpuAlloc(context_name)(as_gpuarray_variable(z, context_name),
*node.inputs)] *node.inputs)]
optdb.register('local_gpua_alloc_empty_to_zeros', optdb.register('local_gpua_alloc_empty_to_zeros',
theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros), theano.tensor.opt.in2out(local_gpua_alloc_empty_to_zeros),
...@@ -918,7 +916,7 @@ def local_gpu_pdbbreakpoint_op(node): ...@@ -918,7 +916,7 @@ def local_gpu_pdbbreakpoint_op(node):
new_outputs = [] new_outputs = []
for i in range(len(new_op_outputs)): for i in range(len(new_op_outputs)):
if input_transfered[i]: if input_transfered[i]:
new_outputs.append(host_from_gpu(new_op_outputs[i])) new_outputs.append(new_op_outputs[i].transfer('cpu'))
else: else:
new_outputs.append(new_op_outputs[i]) new_outputs.append(new_op_outputs[i])
...@@ -983,7 +981,7 @@ def local_gpua_subtensor(op, context_name, inputs, outputs): ...@@ -983,7 +981,7 @@ def local_gpua_subtensor(op, context_name, inputs, outputs):
for n, _ in outputs[0].clients]): for n, _ in outputs[0].clients]):
return return
else: else:
return [host_from_gpu(gpu_x.owner.op(outputs[0]))] return [gpu_x.owner.op(outputs[0]).transfer('cpu')]
return GpuSubtensor(op.idx_list) return GpuSubtensor(op.idx_list)
...@@ -1234,7 +1232,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs): ...@@ -1234,7 +1232,7 @@ def local_gpua_dot22scalar(op, context_name, inputs, outputs):
x, y, a = inputs x, y, a = inputs
x = as_gpuarray_variable(x, context_name) x = as_gpuarray_variable(x, context_name)
y = as_gpuarray_variable(y, context_name) y = as_gpuarray_variable(y, context_name)
z = gpu_alloc_empty(context_name, dtype=x.dtype)(x.shape[0], y.shape[1]) z = GpuAllocEmpty(x.dtype, context_name)(x.shape[0], y.shape[1])
return [gpugemm_no_inplace(z, a, x, y, 0)] return [gpugemm_no_inplace(z, a, x, y, 0)]
...@@ -1804,7 +1802,7 @@ def local_gpu_elemwise_careduce(node): ...@@ -1804,7 +1802,7 @@ def local_gpu_elemwise_careduce(node):
isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)): isinstance(node.inputs[0].owner.op.scalar_op, scalar.basic.Sqr)):
op = node.op op = node.op
inp = node.inputs[0].owner.inputs[0] inp = node.inputs[0].owner.inputs[0]
return [gpu_ca_reduce_cuda(scalar_op=op.scalar_op, return [GpuCAReduceCuda(scalar_op=op.scalar_op,
axis=op.axis, axis=op.axis,
reduce_mask=op.reduce_mask, reduce_mask=op.reduce_mask,
pre_scalar_op=scalar.basic.sqr)(inp)] pre_scalar_op=scalar.basic.sqr)(inp)]
......
...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer ...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape, gpu_alloc_empty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, GpuReshape
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(np.asarray(1.0, dtype='float32')) _one = scal.constant(np.asarray(1.0, dtype='float32'))
...@@ -324,7 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,7 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype) alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -271,7 +271,7 @@ class GpuArrayType(Type): ...@@ -271,7 +271,7 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.gpuarray.basic_ops import gpu_from_host from theano.gpuarray.basic_ops import GpuFromHost
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name) other = other._as_GpuArrayVariable(self.context_name)
...@@ -303,7 +303,7 @@ class GpuArrayType(Type): ...@@ -303,7 +303,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return gpu_from_host(self.context_name)(other) return GpuFromHost(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b, force_same_dtype=True): def values_eq(a, b, force_same_dtype=True):
......
...@@ -9,7 +9,7 @@ import theano ...@@ -9,7 +9,7 @@ import theano
y = theano.tensor.fvector() y = theano.tensor.fvector()
x = theano.shared(np.zeros(1, dtype='float32')) x = theano.shared(np.zeros(1, dtype='float32'))
f1 = theano.function([y], updates={x: y}) f1 = theano.function([y], updates={x: y})
f2 = theano.function([], theano.sandbox.cuda.host_from_gpu(x)) f2 = theano.function([], x.transfer('cpu'))
print(f1.maker.fgraph.toposort()) print(f1.maker.fgraph.toposort())
print(f2.maker.fgraph.toposort()) print(f2.maker.fgraph.toposort())
for i in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]: for i in [1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]:
......
...@@ -29,8 +29,7 @@ from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name, ...@@ -29,8 +29,7 @@ from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name,
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2, register_opt2)
host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available: if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
float32_shared_constructor) float32_shared_constructor)
...@@ -1621,7 +1620,7 @@ def local_gpua_mrg_graph(op, context_name, inputs, outputs): ...@@ -1621,7 +1620,7 @@ def local_gpua_mrg_graph(op, context_name, inputs, outputs):
op.output_type.ndim, op.output_type.ndim,
op.output_type.dtype, op.output_type.dtype,
inputs[1]) inputs[1])
return [outs[0], host_from_gpua(outs[1])] return [outs[0], outs[1].transfer('cpu')]
@register_gpua('fast_compile') @register_gpua('fast_compile')
......
...@@ -152,7 +152,7 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -152,7 +152,7 @@ def traverse(out, x, x_copy, d, visited=None):
return d return d
visited.add(out) visited.add(out)
from theano.sandbox import cuda from theano.sandbox import cuda
from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu from theano.gpuarray.basic_ops import GpuFromHost, host_from_gpu
from theano.gpuarray import pygpu_activated from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
if out == x: if out == x:
...@@ -160,7 +160,7 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -160,7 +160,7 @@ def traverse(out, x, x_copy, d, visited=None):
d[out] = cuda.gpu_from_host(x_copy) d[out] = cuda.gpu_from_host(x_copy)
else: else:
assert isinstance(x.type, GpuArrayType) assert isinstance(x.type, GpuArrayType)
d[out] = gpu_from_host(x.type.context_name)(x_copy) d[out] = GpuFromHost(x.type.context_name)(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
......
...@@ -332,7 +332,7 @@ def make_gpu_optimizer(op, to_gpu): ...@@ -332,7 +332,7 @@ def make_gpu_optimizer(op, to_gpu):
new_inp[idx] = cuda.gpu_from_host(new_inp[idx]) new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
result_node = op()(*new_inp) result_node = op()(*new_inp)
copy_stack_trace(node.outputs[0], result_node) copy_stack_trace(node.outputs[0], result_node)
transfer_node = cuda.host_from_gpu(result_node) transfer_node = result_node.transfer('cpu')
copy_stack_trace(node.outputs[0], transfer_node) copy_stack_trace(node.outputs[0], transfer_node)
return [transfer_node] return [transfer_node]
if node.op == cuda.gpu_from_host: if node.op == cuda.gpu_from_host:
......
Markdown 格式
0%
您添加了 0 人 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论