提交 a24fd9bb authored 作者: Frédéric Bastien's avatar Frédéric Bastien 提交者: GitHub

Merge pull request #4570 from Sentient07/new_graph2gpu

New graph2gpu
...@@ -402,6 +402,14 @@ class Shape_i(gof.Op): ...@@ -402,6 +402,14 @@ class Shape_i(gof.Op):
def infer_shape(self, node, input_shapes): def infer_shape(self, node, input_shapes):
return [()] return [()]
def connection_pattern(self, node):
    """Declare the input as disconnected for gradient purposes.

    Shape_i's output depends only on the *shape* of its input tensor,
    never on the element values, so the elements do not participate in
    the computation and the graph connection is reported as absent.
    """
    return [[False]]
def grad(self, inp, grads): def grad(self, inp, grads):
return [theano.gradient.grad_not_implemented( return [theano.gradient.grad_not_implemented(
op=self, x_pos=0, x=inp[0], op=self, x_pos=0, x=inp[0],
...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None): ...@@ -455,6 +463,14 @@ def shape_i(var, i, fgraph=None):
return var.shape[i] return var.shape[i]
def shape_i_op(i):
    """Return a cached ``Shape_i(i)`` instance, creating it on first use.

    Reusing a single op instance per dimension index lets the merge
    optimizer identify identical shape computations.
    """
    try:
        return shape_i_op.cache[i]
    except KeyError:
        op = Shape_i(i)
        shape_i_op.cache[i] = op
        return op
shape_i_op.cache = {}
def register_shape_i_c_code(typ, code, check_input, version=()): def register_shape_i_c_code(typ, code, check_input, version=()):
""" """
Tell Shape_i how to generate C code for a Theano Type. Tell Shape_i how to generate C code for a Theano Type.
......
...@@ -54,7 +54,7 @@ def _atexit_print_fn(): ...@@ -54,7 +54,7 @@ def _atexit_print_fn():
destination_file = open(config.profiling.destination, 'w') destination_file = open(config.profiling.destination, 'w')
for ps in _atexit_print_list: for ps in _atexit_print_list:
if ps.fct_callcount or ps.compile_time > 0: if ps.fct_callcount >= 1 or ps.compile_time > 1:
ps.summary(file=destination_file, ps.summary(file=destination_file,
n_ops_to_print=config.profiling.n_ops, n_ops_to_print=config.profiling.n_ops,
n_apply_to_print=config.profiling.n_apply) n_apply_to_print=config.profiling.n_apply)
......
...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer): ...@@ -2413,7 +2413,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
for (t, count, n_created, o) in count_opt[::-1]: for (t, count, n_created, o) in count_opt[::-1]:
print(blanc, ' %.3fs - %d - %d - %s' % ( print(blanc, ' %.3fs - %d - %d - %s' % (
t, count, n_created, o), file=stream) t, count, n_created, o), file=stream)
print(blanc, ' %.3fs - in %d optimization that where not used (display only those with a runtime > 0)' % ( print(blanc, ' %.3fs - in %d optimization that were not used (display only those with a runtime > 0)' % (
not_used_time, len(not_used)), file=stream) not_used_time, len(not_used)), file=stream)
not_used.sort(key=lambda nu: (nu[0], str(nu[1]))) not_used.sort(key=lambda nu: (nu[0], str(nu[1])))
for (t, o) in not_used[::-1]: for (t, o) in not_used[::-1]:
......
...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name): ...@@ -70,7 +70,7 @@ def as_gpuarray_variable(x, context_name):
# If we couldn't deal with transfers, then maybe it's a tensor # If we couldn't deal with transfers, then maybe it's a tensor
if isinstance(x.type, tensor.TensorType): if isinstance(x.type, tensor.TensorType):
return GpuFromHost(context_name)(x) return gpu_from_host(context_name)(x)
# Try _as_GpuArrayVariable if possible # Try _as_GpuArrayVariable if possible
if hasattr(x, '_as_GpuArrayVariable'): if hasattr(x, '_as_GpuArrayVariable'):
...@@ -544,7 +544,7 @@ class HostFromGpu(Op): ...@@ -544,7 +544,7 @@ class HostFromGpu(Op):
def grad(self, inputs, grads): def grad(self, inputs, grads):
gz, = grads gz, = grads
return [GpuFromHost(inputs[0].type.context_name)(gz)] return [gpu_from_host(inputs[0].type.context_name)(gz)]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
ev, = eval_points ev, = eval_points
...@@ -647,6 +647,14 @@ class GpuFromHost(Op): ...@@ -647,6 +647,14 @@ class GpuFromHost(Op):
return (9,) return (9,)
# Caching GpuFromHost
def gpu_from_host(ctx):
    """Return a cached ``GpuFromHost`` op for the given context name.

    Bug fix: the membership test previously looked in ``gpu_alloc.cache``
    (whose keys are ``(ctx, memset_0)`` tuples), so it was always True and
    a new ``GpuFromHost`` instance was built on every call, defeating the
    cache. Test membership in this function's own cache instead.
    """
    if ctx not in gpu_from_host.cache:
        gpu_from_host.cache[ctx] = GpuFromHost(ctx)
    return gpu_from_host.cache[ctx]
gpu_from_host.cache = {}
class GpuToGpu(Op): class GpuToGpu(Op):
""" """
Transfer data between GPUs. Transfer data between GPUs.
...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc): ...@@ -870,6 +878,15 @@ class GpuAlloc(HideC, Alloc):
return True return True
# Caching GpuAlloc
def gpu_alloc(ctx, memset_0=False):
    """Return a cached ``GpuAlloc`` op for ``(ctx, memset_0)``.

    A single instance is kept per (context, memset flag) pair so that
    repeated lookups yield the same op object.
    """
    cache_key = (ctx, memset_0)
    op = gpu_alloc.cache.get(cache_key)
    if op is None:
        op = GpuAlloc(ctx, memset_0)
        gpu_alloc.cache[cache_key] = op
    return op
gpu_alloc.cache = {}
class GpuAllocEmpty(HideC, Alloc): class GpuAllocEmpty(HideC, Alloc):
""" """
Allocate uninitialized memory on the GPU. Allocate uninitialized memory on the GPU.
...@@ -956,6 +973,14 @@ def empty_like(var): ...@@ -956,6 +973,14 @@ def empty_like(var):
return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape) return GpuAllocEmpty(var.type.dtype, var.type.context_name)(*var.shape)
def gpu_alloc_empty(ctx, dtype):
    """Return a cached ``GpuAllocEmpty`` op for ``(dtype, ctx)``.

    One instance is kept per (dtype, context) pair and reused across
    calls.
    """
    cache_key = (dtype, ctx)
    try:
        return gpu_alloc_empty.cache[cache_key]
    except KeyError:
        op = GpuAllocEmpty(dtype, ctx)
        gpu_alloc_empty.cache[cache_key] = op
        return op
gpu_alloc_empty.cache = {}
class GpuContiguous(Op): class GpuContiguous(Op):
""" """
Return a C contiguous version of the input. Return a C contiguous version of the input.
...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape): ...@@ -1031,6 +1056,7 @@ class GpuReshape(HideC, tensor.Reshape):
def make_node(self, x, shp): def make_node(self, x, shp):
ctx_name = infer_context_name(x) ctx_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name=ctx_name) x = as_gpuarray_variable(x, context_name=ctx_name)
shp = tensor.as_tensor_variable(shp)
res = host_from_gpu(x).reshape(shp, ndim=self.ndim) res = host_from_gpu(x).reshape(shp, ndim=self.ndim)
otype = GpuArrayType(dtype=res.dtype, otype = GpuArrayType(dtype=res.dtype,
broadcastable=res.broadcastable, broadcastable=res.broadcastable,
......
差异被折叠。
...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype): ...@@ -2587,6 +2587,18 @@ class GpuCAReduceCuda(GpuKernelBase, HideC, CAReduceDtype):
return kernels return kernels
# Caching GpuCAReduceCuda
def gpu_ca_reduce_cuda(scalar_op, axis=None, reduce_mask=None, dtype=None,
                       acc_dtype=None, pre_scalar_op=None):
    """Return a cached ``GpuCAReduceCuda`` op for the given parameters.

    The full parameter tuple is used as the cache key so each distinct
    reduction configuration maps to exactly one op instance.
    """
    cache_key = (scalar_op, axis, reduce_mask, dtype, acc_dtype,
                 pre_scalar_op)
    try:
        return gpu_ca_reduce_cuda.cache[cache_key]
    except KeyError:
        op = GpuCAReduceCuda(scalar_op, axis, reduce_mask, dtype,
                             acc_dtype, pre_scalar_op)
        gpu_ca_reduce_cuda.cache[cache_key] = op
        return op
gpu_ca_reduce_cuda.cache = {}
class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype): class GpuCAReduceCPY(GpuKernelBase, HideC, CAReduceDtype):
""" """
CAReduce that reuse the python code from gpuarray. CAReduce that reuse the python code from gpuarray.
......
...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division ...@@ -2,15 +2,14 @@ from __future__ import absolute_import, print_function, division
import os import os
from theano import Apply, Op from theano import Apply, Op
from theano.tensor.extra_ops import CumsumOp from theano.tensor.extra_ops import CumsumOp
from .basic_ops import infer_context_name
try: try:
from pygpu import gpuarray from pygpu import gpuarray
except ImportError: except ImportError:
pass pass
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, GpuReshape)
infer_context_name, GpuFromHost) from .opt import register_opt, op_lifter, register_opt2
from .opt import register_opt as register_gpu_opt, op_lifter
class GpuCumsum(GpuKernelBase, Op): class GpuCumsum(GpuKernelBase, Op):
...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -40,7 +39,10 @@ class GpuCumsum(GpuKernelBase, Op):
def make_node(self, x): def make_node(self, x):
assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum" assert x.type.dtype == 'float32', "Only float32 supported for GpuCumSum"
x = as_gpuarray_variable(x, infer_context_name(x))
context_name = infer_context_name(x)
x = as_gpuarray_variable(x, context_name)
if x.ndim > GpuCumsum.SUPPORTED_NDIMS: if x.ndim > GpuCumsum.SUPPORTED_NDIMS:
raise NotImplementedError('Only cumsum on 1D, 2D and\ raise NotImplementedError('Only cumsum on 1D, 2D and\
...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op): ...@@ -451,24 +453,23 @@ class GpuCumsum(GpuKernelBase, Op):
return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code return super(GpuCumsum, self).c_support_code_struct(node, nodename) + code
@register_opt('fast_compile')
@op_lifter([CumsumOp]) @op_lifter([CumsumOp])
def use_gpu_cumsumop(node, ctx_name): @register_opt2([CumsumOp], 'fast_compile')
if node.inputs[0].dtype == 'float32': def local_gpua_cumsumop(op, ctx_name, inputs, outputs):
axis = node.op.axis if inputs[0].dtype == 'float32':
x = node.inputs[0] axis = op.axis
x = inputs[0]
if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS: if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
return None return None
if axis is None and x.ndim > 1: x = as_gpuarray_variable(x, ctx_name)
x = x.flatten()
x = GpuFromHost(ctx_name)(x) if axis is None and x.ndim > 1:
x = GpuReshape(1)(x, (-1,))
# ``gpu_cumsum`` assume array has been flattened if needed. # ``gpu_cumsum`` assume array has been flattened if needed.
if axis is None: if axis is None:
axis = 0 axis = 0
return GpuCumsum(axis)(x) return GpuCumsum(axis)(x)
register_gpu_opt()(use_gpu_cumsumop)
...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType ...@@ -9,7 +9,7 @@ from theano.gradient import DisconnectedType
from theano.gpuarray import (basic_ops, GpuArrayType) from theano.gpuarray import (basic_ops, GpuArrayType)
import theano.tensor.fft import theano.tensor.fft
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
try: try:
import pygpu import pygpu
...@@ -373,10 +373,12 @@ def _unitary(norm): ...@@ -373,10 +373,12 @@ def _unitary(norm):
if scikits_cuda_available: if scikits_cuda_available:
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.RFFTOp]) @op_lifter([theano.tensor.fft.RFFTOp])
def local_curfft_op(node, context_name): @register_opt2([theano.tensor.fft.RFFTOp], 'fast_compile')
def local_gpua_curfft_op(op, ctx_name, inputs, outputs):
return curfft_op return curfft_op
@register_opt('fast_compile') @register_opt('fast_compile')
@op_lifter([theano.tensor.fft.IRFFTOp]) @op_lifter([theano.tensor.fft.IRFFTOp])
def local_cuirfft_op(node, context_name): @register_opt2([theano.tensor.fft.IRFFTOp], 'fast_compile')
def local_gpua_cuirfft_op(op, ctx_name, inputs, outputs):
return cuirfft_op return cuirfft_op
...@@ -14,7 +14,7 @@ from theano.gof import Op ...@@ -14,7 +14,7 @@ from theano.gof import Op
from theano.tensor import NotScalarConstantError, get_scalar_constant_value from theano.tensor import NotScalarConstantError, get_scalar_constant_value
from theano import gpuarray from theano import gpuarray
from .basic_ops import as_gpuarray_variable, infer_context_name from .basic_ops import as_gpuarray_variable, infer_context_name
from .opt import register_opt, op_lifter from .opt import register_opt, op_lifter, register_opt2
from .type import GpuArrayType from .type import GpuArrayType
...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial( ...@@ -227,23 +227,24 @@ KERNEL void k_multi_warp_multinomial(
return (1,) return (1,)
@register_opt() @register_opt('fast_compile')
@op_lifter([theano.sandbox.multinomial.MultinomialFromUniform]) @op_lifter([theano.sandbox.multinomial.MultinomialFromUniform])
def local_gpua_multinomial(node, context_name): @register_opt2([theano.sandbox.multinomial.MultinomialFromUniform], 'fast_compile')
def local_gpua_multinomial(op, context_name, inputs, outputs):
# TODO : need description for function # TODO : need description for function
if len(node.inputs) == 2: if len(inputs) == 2:
p, u = node.inputs p, u = inputs
n_samples = 1 n_samples = 1
else: else:
p, u, n_samples = node.inputs p, u, n_samples = inputs
try: try:
if get_scalar_constant_value(n_samples) != 1: if get_scalar_constant_value(n_samples) != 1:
return None return None
except NotScalarConstantError: except NotScalarConstantError:
return None return None
m, = node.outputs m, = outputs
if (p.dtype == u.dtype == m.dtype == 'float32'): if (p.dtype == u.dtype == m.dtype == 'float32'):
gpu_op = GPUAMultinomialFromUniform(node.op.odtype) gpu_op = GPUAMultinomialFromUniform(op.odtype)
return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])( return gpuarray.elemwise.GpuDimShuffle([False, False], [1, 0])(
gpu_op(p, u)) gpu_op(p, u))
...@@ -13,7 +13,7 @@ except ImportError: ...@@ -13,7 +13,7 @@ except ImportError:
from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel, from .basic_ops import (as_gpuarray_variable, GpuKernelBase, Kernel,
infer_context_name) infer_context_name)
from .opt import register_opt as register_gpu_opt, op_lifter from .opt import register_opt2, op_lifter, register_opt
from .type import GpuArrayType from .type import GpuArrayType
...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op): ...@@ -468,9 +468,9 @@ class GpuImages2Neibs(GpuKernelBase, Images2Neibs, Op):
Op.perform(self, node, inp, out, ctx) Op.perform(self, node, inp, out, ctx)
@register_opt('fast_compile')
@op_lifter([Images2Neibs]) @op_lifter([Images2Neibs])
def use_gpu_images2neibs(node, context_name): @register_opt2([Images2Neibs], 'fast_compile')
if node.op.mode in ['valid', 'ignore_borders', 'wrap_centered']: def local_gpua_images2neibs(op, context_name, inputs, outputs):
return GpuImages2Neibs(node.op.mode) if op.mode in ['valid', 'ignore_borders', 'wrap_centered']:
return GpuImages2Neibs(op.mode)
register_gpu_opt()(use_gpu_images2neibs)
...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant ...@@ -10,7 +10,7 @@ from theano.scalar import as_scalar, constant
from . import opt from . import opt
from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty, from .basic_ops import (as_gpuarray_variable, GpuAllocEmpty,
infer_context_name) infer_context_name, gpu_alloc_empty)
from .type import gpu_context_type from .type import gpu_context_type
from .opt_util import alpha_merge, output_merge from .opt_util import alpha_merge, output_merge
...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz, ...@@ -147,17 +147,18 @@ if (GpuKernel_init(&k_%(name)s, c->ctx, 1, &bcode, &sz,
return '\n'.join(codel) return '\n'.join(codel)
@opt.register_opt() @opt.register_opt('fast_compile')
@opt.op_lifter([tensor.Dot]) @opt.op_lifter([tensor.Dot])
def local_dot_to_gemm16(node, ctx_name): @opt.register_opt2([tensor.Dot], 'fast_compile')
def local_gpua_dot_to_gemm16(op, ctx_name, inputs, outputs):
if nerv is None: if nerv is None:
return return
A = node.inputs[0] A = inputs[0]
B = node.inputs[1] B = inputs[1]
if (A.ndim == 2 and B.ndim == 2 and if (A.ndim == 2 and B.ndim == 2 and
A.dtype == 'float16' and B.dtype == 'float16'): A.dtype == 'float16' and B.dtype == 'float16'):
fgraph = node.inputs[0].fgraph fgraph = getattr(outputs[0], 'fgraph', None)
C = GpuAllocEmpty(dtype='float16', context_name=ctx_name)( C = gpu_alloc_empty(ctx_name, dtype='float16')(
shape_i(A, 0, fgraph), shape_i(B, 1, fgraph)) shape_i(A, 0, fgraph), shape_i(B, 1, fgraph))
return Gemm16()(C, 1.0, A, B, 0.0) return Gemm16()(C, 1.0, A, B, 0.0)
......
差异被折叠。
...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer ...@@ -8,7 +8,7 @@ from theano.gof import local_optimizer
from theano.tensor import (DimShuffle, get_scalar_constant_value, from theano.tensor import (DimShuffle, get_scalar_constant_value,
NotScalarConstantError) NotScalarConstantError)
from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty from .basic_ops import GpuFromHost, HostFromGpu, GpuAllocEmpty, gpu_alloc_empty
from .elemwise import GpuDimShuffle, GpuElemwise from .elemwise import GpuDimShuffle, GpuElemwise
_one = scal.constant(numpy.asarray(1.0, dtype='float32')) _one = scal.constant(numpy.asarray(1.0, dtype='float32'))
...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx): ...@@ -324,8 +324,7 @@ def inplace_allocempty(op, idx):
if (alloc.owner and if (alloc.owner and
isinstance(alloc.owner.op, GpuAllocEmpty) and isinstance(alloc.owner.op, GpuAllocEmpty) and
len(alloc.clients) > 1): len(alloc.clients) > 1):
alloc_op = GpuAllocEmpty(alloc.owner.op.dtype, alloc_op = gpu_alloc_empty(alloc.owner.op.context_name, dtype=alloc.owner.op.dtype)
alloc.owner.op.context_name)
inputs[idx] = alloc_op(*alloc.owner.inputs) inputs[idx] = alloc_op(*alloc.owner.inputs)
return maker(node, inputs) return maker(node, inputs)
return opt return opt
......
...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d): ...@@ -26,9 +26,11 @@ class TestDnnConv2d(test_abstract_conv.BaseTestConv2d):
if not dnn_available(test_ctx_name): if not dnn_available(test_ctx_name):
raise SkipTest(dnn_available.msg) raise SkipTest(dnn_available.msg)
mode = mode_with_gpu mode = mode_with_gpu
if fd != (1, 1): if fd != (1, 1):
raise SkipTest("Doesn't have CUDNN implementation") raise SkipTest("Doesn't have CUDNN implementation")
o = self.get_output_shape(i, f, s, b, fd) o = self.get_output_shape(i, f, s, b, fd)
self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s, self.run_fwd(inputs_shape=i, filters_shape=f, subsample=s,
verify_grad=True, mode=mode, verify_grad=True, mode=mode,
provide_shape=provide_shape, border_mode=b, provide_shape=provide_shape, border_mode=b,
......
...@@ -396,7 +396,7 @@ def test_gpueye(): ...@@ -396,7 +396,7 @@ def test_gpueye():
k_symb = numpy.asarray(0) k_symb = numpy.asarray(0)
out = T.eye(N_symb, M_symb, k_symb, dtype=dtype) out = T.eye(N_symb, M_symb, k_symb, dtype=dtype)
f = theano.function([N_symb, M_symb], f = theano.function([N_symb, M_symb],
out, T.stack(out),
mode=mode_with_gpu) mode=mode_with_gpu)
result = numpy.asarray(f(N, M)) result = numpy.asarray(f(N, M))
assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype)) assert numpy.allclose(result, numpy.eye(N, M_, dtype=dtype))
......
...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0(): ...@@ -138,11 +138,21 @@ def test_local_gpualloc_memset_0():
ones = numpy.ones((2,), dtype='float32') ones = numpy.ones((2,), dtype='float32')
# Test with 0 from CPU op. # Test with 0 from CPU op.
# Should not be transfered as the only client is the output
a = tensor.alloc(z, i) a = tensor.alloc(z, i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, GpuAlloc) and topo[0].op.memset_0 assert isinstance(topo[0].op, theano.tensor.Alloc)
assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 from CPU op.
# Should be transfered as it is used by another op.
a = tensor.alloc(z, i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAlloc)
assert (numpy.asarray(f(6)) == 0).all() assert (numpy.asarray(f(6)) == 0).all()
# Test with 0 # Test with 0
...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty(): ...@@ -177,19 +187,30 @@ def test_local_gpualloc_empty():
ii = theano.tensor.iscalar() ii = theano.tensor.iscalar()
# Test with vector # Test with vector
# Should not be moved as the only client is the output
a = tensor.AllocEmpty('float32')(i) a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a, mode=mode_with_gpu) f = theano.function([i], a, mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 1
assert isinstance(topo[0].op, theano.tensor.AllocEmpty)
# This return not initilized data, so we can only check the shape
assert f(3).shape == (3,)
# Test with vector
# Should be moved
a = tensor.AllocEmpty('float32')(i)
f = theano.function([i], a.cumsum(), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort()
assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3).shape == (3,) assert f(3).shape == (3,)
# Test with matrix # Test with matrix
a = tensor.AllocEmpty('float32')(i, ii) a = tensor.AllocEmpty('float32')(i, ii)
f = theano.function([i, ii], a, mode=mode_with_gpu) f = theano.function([i, ii], a.cumsum(axis=0), mode=mode_with_gpu)
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert len(topo) == 2 assert len(topo) == 3
assert isinstance(topo[0].op, GpuAllocEmpty) assert isinstance(topo[0].op, GpuAllocEmpty)
# This return not initilized data, so we can only check the shape # This return not initilized data, so we can only check the shape
assert f(3, 4).shape == (3, 4) assert f(3, 4).shape == (3, 4)
...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor(): ...@@ -334,7 +355,10 @@ def test_local_gpu_subtensor():
topo = f.maker.fgraph.toposort() topo = f.maker.fgraph.toposort()
assert any([type(node.op) is tensor.Subtensor for node in topo]) assert any([type(node.op) is tensor.Subtensor for node in topo])
assert not any([isinstance(node.op, GpuSubtensor) for node in topo]) assert not any([isinstance(node.op, GpuSubtensor) for node in topo])
assert any([isinstance(node.op, GpuElemwise) for node in topo]) # Our optimizer isn't smart enough to move to the GPU Elemwise.
# If it where just a little bit smarter, it could wrongly move it to the GPU.
# If it where super smart, it would know it should not move it to the GPU.
assert any([isinstance(node.op, tensor.Elemwise) for node in topo])
def test_local_gpu_elemwise(): def test_local_gpu_elemwise():
...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op(): ...@@ -427,7 +451,7 @@ def test_local_assert_no_cpu_op():
out = theano.tensor.tanh(ms).dot(ms.T) out = theano.tensor.tanh(ms).dot(ms.T)
mode_local_assert = mode_with_gpu.including("assert_no_cpu_op") mode_local_assert = mode_with_gpu.including("assert_no_cpu_op")
mode_local_assert = mode_local_assert.excluding("local_gpu_elemwise") mode_local_assert = mode_local_assert.excluding("local_gpua_elemwise")
old = theano.config.assert_no_cpu_op old = theano.config.assert_no_cpu_op
old2 = theano.config.on_opt_error old2 = theano.config.on_opt_error
......
...@@ -233,7 +233,7 @@ class GpuArrayType(Type): ...@@ -233,7 +233,7 @@ class GpuArrayType(Type):
return data return data
def filter_variable(self, other, allow_convert=True): def filter_variable(self, other, allow_convert=True):
from theano.gpuarray import GpuFromHost from theano.gpuarray.basic_ops import gpu_from_host
if hasattr(other, '_as_GpuArrayVariable'): if hasattr(other, '_as_GpuArrayVariable'):
other = other._as_GpuArrayVariable(self.context_name) other = other._as_GpuArrayVariable(self.context_name)
...@@ -265,7 +265,7 @@ class GpuArrayType(Type): ...@@ -265,7 +265,7 @@ class GpuArrayType(Type):
str(self.broadcastable))) str(self.broadcastable)))
other = other2 other = other2
return GpuFromHost(self.context_name)(other) return gpu_from_host(self.context_name)(other)
@staticmethod @staticmethod
def values_eq(a, b, force_same_dtype=True): def values_eq(a, b, force_same_dtype=True):
......
...@@ -24,10 +24,11 @@ from . import multinomial ...@@ -24,10 +24,11 @@ from . import multinomial
import theano.sandbox.cuda import theano.sandbox.cuda
from theano.sandbox.cuda import GpuOp from theano.sandbox.cuda import GpuOp
from theano.gpuarray.basic_ops import GpuKernelBase, Kernel from theano.gpuarray.basic_ops import GpuKernelBase, Kernel, infer_context_name
from theano.gpuarray.type import GpuArrayType from theano.gpuarray.type import GpuArrayType
from theano.gpuarray.fp16_help import write_w from theano.gpuarray.fp16_help import write_w
from theano.gpuarray.opt import (register_opt as register_gpua, from theano.gpuarray.opt import (register_opt as register_gpua,
register_opt2,
host_from_gpu as host_from_gpua) host_from_gpu as host_from_gpua)
if theano.sandbox.cuda.cuda_available: if theano.sandbox.cuda.cuda_available:
from theano.sandbox.cuda import (CudaNdarrayType, from theano.sandbox.cuda import (CudaNdarrayType,
...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object): ...@@ -1551,17 +1552,22 @@ class MRG_RandomStreams(object):
return final_samples return final_samples
@register_opt2([mrg_uniform], 'fast_compile')
def local_gpua_mrg_graph(op, context_name, inputs, outputs):
    """Lift a CPU ``mrg_uniform`` whose state already lives on the GPU.

    Fires only when the node is exactly ``mrg_uniform`` (exact type
    check, so GPU subclasses are excluded) and its first input (the RNG
    state) is already a ``GpuArrayType`` variable. Returns the GPU state
    update plus the samples transferred back to the host; returns None
    (implicitly) otherwise so the optimizer skips the node.
    """
    if (type(op) == mrg_uniform and
        isinstance(inputs[0].type, GpuArrayType)):
        # Build the GPU sampler with the same output ndim/dtype as the
        # CPU op; inputs[1] is the requested output shape.
        outs = GPUA_mrg_uniform.new(inputs[0],
                                    op.output_type.ndim,
                                    op.output_type.dtype,
                                    inputs[1])
        # outs[0]: updated GPU rstate; outs[1]: samples, moved to host
        # so downstream CPU consumers keep working.
        return [outs[0], host_from_gpua(outs[1])]
@register_gpua('fast_compile') @register_gpua('fast_compile')
@local_optimizer([mrg_uniform]) @local_optimizer([mrg_uniform])
def local_gpua_mrg(node): def local_gpua_mrg(node):
# TODO : need description for function context_name = infer_context_name(*node.inputs)
if (type(node.op) == mrg_uniform and return local_gpua_mrg_graph(node.op, context_name, node.inputs, node.outputs)
isinstance(node.inputs[0].type, GpuArrayType)):
outs = GPUA_mrg_uniform.new(node.inputs[0],
node.op.output_type.ndim,
node.op.output_type.dtype,
node.inputs[1])
return [outs[0], host_from_gpua(outs[1])]
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform) MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)
......
...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -152,13 +152,15 @@ def traverse(out, x, x_copy, d, visited=None):
return d return d
visited.add(out) visited.add(out)
from theano.sandbox import cuda from theano.sandbox import cuda
from theano import gpuarray from theano.gpuarray.basic_ops import gpu_from_host, host_from_gpu
from theano.gpuarray import pygpu_activated
from theano.gpuarray.type import GpuArrayType
if out == x: if out == x:
if isinstance(x.type, cuda.CudaNdarrayType): if isinstance(x.type, cuda.CudaNdarrayType):
d[out] = cuda.gpu_from_host(x_copy) d[out] = cuda.gpu_from_host(x_copy)
else: else:
assert isinstance(x.type, gpuarray.GpuArrayType) assert isinstance(x.type, GpuArrayType)
d[out] = gpuarray.GpuFromHost(x.type.context_name)(x_copy) d[out] = gpu_from_host(x.type.context_name)(x_copy)
return d return d
elif out.owner is None: elif out.owner is None:
return d return d
...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None): ...@@ -167,8 +169,8 @@ def traverse(out, x, x_copy, d, visited=None):
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
elif (gpuarray.pygpu_activated and elif (pygpu_activated and
out.owner.op == gpuarray.host_from_gpu and out.owner.op == host_from_gpu and
out.owner.inputs == [x]): out.owner.inputs == [x]):
d[out] = tensor.as_tensor_variable(x_copy) d[out] = tensor.as_tensor_variable(x_copy)
return d return d
......
...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True, ...@@ -630,9 +630,15 @@ def get_scalar_constant_value(orig_v, elemwise=True,
v = v.owner.inputs[0] v = v.owner.inputs[0]
continue continue
elif isinstance(v.owner.op, theano.compile.ops.Shape_i): elif isinstance(v.owner.op, theano.compile.ops.Shape_i):
if isinstance(v.owner.inputs[0], Constant): i = v.owner.op.i
return numpy.asarray( inp = v.owner.inputs[0]
v.owner.inputs[0].data.shape[v.owner.op.i]) if isinstance(inp, Constant):
return numpy.asarray(inp.data.shape[i])
# The shape of a broadcastable dimension is 1
if (hasattr(inp.type, 'broadcastable') and
inp.type.broadcastable[i]):
return numpy.asarray(1)
# Don't act as the constant_folding optimization here as this # Don't act as the constant_folding optimization here as this
# fct is used too early in the optimization phase. This would # fct is used too early in the optimization phase. This would
# mess with the stabilization optimization and be too slow. # mess with the stabilization optimization and be too slow.
...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op): ...@@ -2690,15 +2696,18 @@ class Alloc(gof.Op):
sh = [as_tensor_variable(s) for s in shape] sh = [as_tensor_variable(s) for s in shape]
bcast = [] bcast = []
for i, s in enumerate(sh): for i, s in enumerate(sh):
if config.exception_verbosity == 'high': def err_str():
s_as_str = '\n' + min_informative_str(s) if config.exception_verbosity == 'high':
else: return '\n' + min_informative_str(s)
s_as_str = str(s) else:
return str(s)
if s.type.dtype[:3] not in ('int', 'uin'): if s.type.dtype[:3] not in ('int', 'uin'):
s_as_str = err_str()
raise TypeError('Shape arguments to Alloc must be integers, ' raise TypeError('Shape arguments to Alloc must be integers, '
'but argument %s is not for apply node: %s' % 'but argument %s is not for apply node: %s' %
(i, s_as_str)) (i, s_as_str))
if s.ndim != 0: if s.ndim != 0:
s_as_str = err_str()
raise TypeError( raise TypeError(
"Each shape dimension to Alloc must be a scalar, ", "Each shape dimension to Alloc must be a scalar, ",
'but dimension %s have %d dimensions for apply node: %s' % 'but dimension %s have %d dimensions for apply node: %s' %
......
...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape, ...@@ -66,8 +66,10 @@ def get_conv_output_shape(image_shape, kernel_shape,
""" """
bsize, imshp = image_shape[0], image_shape[2:] bsize, imshp = image_shape[0], image_shape[2:]
nkern, kshp = kernel_shape[0], kernel_shape[2:] nkern, kshp = kernel_shape[0], kernel_shape[2:]
if filter_dilation is None: if filter_dilation is None:
filter_dilation = numpy.ones(len(subsample), dtype='int') filter_dilation = numpy.ones(len(subsample), dtype='int')
if isinstance(border_mode, tuple): if isinstance(border_mode, tuple):
out_shp = tuple(get_conv_shape_1axis( out_shp = tuple(get_conv_shape_1axis(
imshp[i], kshp[i], border_mode[i], imshp[i], kshp[i], border_mode[i],
...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode, ...@@ -121,7 +123,16 @@ def get_conv_shape_1axis(image_shape, kernel_shape, border_mode,
pad = border_mode pad = border_mode
if pad < 0: if pad < 0:
raise ValueError("border_mode must be >= 0") raise ValueError("border_mode must be >= 0")
out_shp = (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
# In case of symbolic shape, we want to build the smallest graph
# (image_shape + 2 * pad - dil_kernel_shape) // subsample + 1
if pad == 0:
out_shp = (image_shape - dil_kernel_shape)
else:
out_shp = (image_shape + 2 * pad - dil_kernel_shape)
if subsample != 1:
out_shp = out_shp // subsample
out_shp = out_shp + 1
return out_shp return out_shp
......
...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase): ...@@ -7003,6 +7003,9 @@ class T_get_scalar_constant_value(unittest.TestCase):
assert get_scalar_constant_value(s) == 3 assert get_scalar_constant_value(s) == 3
s = opt.Shape_i(1)(c) s = opt.Shape_i(1)(c)
assert get_scalar_constant_value(s) == 4 assert get_scalar_constant_value(s) == 4
d = theano.shared(numpy.random.randn(1,1), broadcastable=(True, True))
f = theano.tensor.basic.ScalarFromTensor()(opt.Shape_i(0)(d))
assert get_scalar_constant_value(f) == 1
def test_elemwise(self): def test_elemwise(self):
# We test only for a few elemwise, the list of all supported # We test only for a few elemwise, the list of all supported
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论