提交 8bd900f8 authored 作者: Frédéric Bastien's avatar Frédéric Bastien

Merge pull request #1582 from abergeron/compyte2-rb

Compyte2 rebase
......@@ -932,7 +932,7 @@ class T_subtensor(theano.tensor.tests.test_subtensor.T_subtensor):
adv_incsub1 = cuda.GpuAdvancedIncSubtensor1
mode = mode_with_gpu
dtype = 'float32'
ignore_topo = (B.HostFromGpu, B.GpuFromHost)
ignore_topo = (B.HostFromGpu, B.GpuFromHost, theano.compile.DeepCopyOp)
fast_compile = False
ops = (cuda.GpuSubtensor, cuda.GpuIncSubtensor,
cuda.GpuAdvancedSubtensor1, cuda.GpuAdvancedIncSubtensor1)
......
import logging
import theano
from theano.configparser import config
from theano.configparser import config, AddConfigVar, BoolParam
from theano.compile import optdb
_logger_name = 'theano.sandbox.gpuarray'
......@@ -18,6 +18,13 @@ try:
except ImportError:
pygpu = None
AddConfigVar('gpuarray.sync',
"""If True, every op will make sure its work is done before
returning. Setting this to True will slow down execution,
but give much more accurate results in profiling.""",
BoolParam(False),
in_c_key=True)
# Imported unconditionally so that building the documentation does not
# depend on pygpu being available.
from type import (GpuArrayType, GpuArrayVariable, GpuArrayConstant,
GpuArraySharedVariable, gpuarray_shared_constructor)
......
from theano import Op, Apply, config
from theano.tensor.blas import Gemv, Gemm
from theano.sandbox.gpuarray.basic_ops import (HideC, as_gpuarray_variable)
try:
import pygpu
from pygpu import blas
except ImportError, e:
# To make sure theano is importable
pass
class BlasOp(HideC):
    """Mixin with the C-level setup shared by the gpuarray BLAS ops."""

    def c_headers(self):
        return ['<blas_api.h>']

    def c_header_dirs(self):
        return [pygpu.get_include()]

    def c_init_code(self):
        # Pulls the pygpu BLAS C API into the compiled module.
        return ['import_pygpu__blas();']
class GpuGemv(BlasOp, Gemv):
    """GEMV (y <- alpha * dot(A, x) + beta * y) on gpuarray data."""

    def make_node(self, y, alpha, A, x, beta):
        # Run Gemv.make_node first for its dtype/ndim validation, then
        # rebuild the Apply with the array arguments moved to the GPU.
        res = Gemv.make_node(self, y, alpha, A, x, beta)
        A = as_gpuarray_variable(A)
        x = as_gpuarray_variable(x)
        y = as_gpuarray_variable(y)
        return Apply(self, [y, alpha, A, x, beta], [y.type()])

    def perform(self, node, inputs, out_storage):
        y, alpha, A, x, beta = inputs
        out_storage[0][0] = blas.gemv(alpha, A, x, beta, y, trans=False,
                                      overwrite_y=self.inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], y=inp[0], alpha=inp[1], A=inp[2], x=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        if self.inplace:
            # Destructive version: reuse y's storage for the output.
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = %(y)s;
        Py_INCREF(%(out)s);
        """ % vars
        else:
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(y)s, GA_ANY_ORDER);
        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % vars
        code += """
        if (pygpu_blas_rgemv(cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(x)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % vars
        if config.gpuarray.sync:
            # BUG FIX: this fragment was previously appended without the
            # '% vars' interpolation, so the literal text %(out)s ended up
            # in the generated C source and broke compilation whenever
            # config.gpuarray.sync was enabled.
            code += """
        GpuArray_sync(&%(out)s->ga);
        """ % vars
        return code

    def c_code_cache_version(self):
        # Bumped from (0,) because the generated C changed (sync fix).
        return (1,)


gpugemv_no_inplace = GpuGemv(inplace=False)
gpugemv_inplace = GpuGemv(inplace=True)
class GpuGemm(BlasOp, Gemm):
    """GEMM (C <- alpha * dot(A, B) + beta * C) on gpuarray data."""

    def make_node(self, C, alpha, A, B, beta):
        # Run Gemm.make_node first for its dtype/ndim validation, then
        # rebuild the Apply with the matrix arguments moved to the GPU.
        res = Gemm.make_node(self, C, alpha, A, B, beta)
        A = as_gpuarray_variable(A)
        B = as_gpuarray_variable(B)
        C = as_gpuarray_variable(C)
        return Apply(self, [C, alpha, A, B, beta], [C.type()])

    def perform(self, node, inputs, outputs):
        C, alpha, A, B, beta = inputs
        outputs[0][0] = blas.gemm(alpha, A, B, beta, C,
                                  overwrite_c=self.inplace)

    def c_code(self, node, name, inp, out, sub):
        vars = dict(out=out[0], C=inp[0], alpha=inp[1], A=inp[2], B=inp[3],
                    beta=inp[4], fail=sub['fail'], name=name)
        if self.inplace:
            # Destructive version: reuse C's storage for the output.
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = %(C)s;
        Py_INCREF(%(out)s);
        """ % vars
        else:
            code = """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(C)s, GA_ANY_ORDER);
        if (%(out)s == NULL) {
            %(fail)s
        }
        """ % vars
        code += """
        if (pygpu_blas_rgemm(cb_no_trans, cb_no_trans,
                             ((dtype_%(alpha)s *)PyArray_DATA(%(alpha)s))[0],
                             %(A)s, %(B)s,
                             ((dtype_%(beta)s *)PyArray_DATA(%(beta)s))[0],
                             %(out)s) == NULL) {
            %(fail)s
        }
        """ % vars
        if config.gpuarray.sync:
            # BUG FIX: this fragment was previously appended without the
            # '% vars' interpolation, so the literal text %(out)s ended up
            # in the generated C source and broke compilation whenever
            # config.gpuarray.sync was enabled.
            code += """
        GpuArray_sync(&%(out)s->ga);
        """ % vars
        return code

    def c_code_cache_version(self):
        # Bumped from (0,) because the generated C changed (sync fix).
        return (1,)


gpugemm_no_inplace = GpuGemm(inplace=False)
gpugemm_inplace = GpuGemm(inplace=True)
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out
@local_optimizer([gpugemv_no_inplace])
def local_inplace_gpuagemv(node):
    """Swap a non-inplace GpuGemv for its destructive counterpart."""
    if node.op != gpugemv_no_inplace:
        return
    return [gpugemv_inplace(*node.inputs)]
@local_optimizer([gpugemm_no_inplace])
def local_inplace_gpuagemm(node):
    """Swap a non-inplace GpuGemm for its destructive counterpart."""
    if node.op != gpugemm_no_inplace:
        return
    return [gpugemm_inplace(*node.inputs)]
# Bundle the two inplace substitutions into a single in2out pass.
gpuablas_opt_inplace = in2out(LocalOptGroup(
    local_inplace_gpuagemv, local_inplace_gpuagemm),
    name='gpuablas_opt_inplace')
# Position 70.0 places this after the main gpuarray transfer
# optimizations but with the other inplace/fast_run passes.
optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
import theano, numpy
from theano import tensor
import copy
import theano
import numpy
from theano import tensor, scalar
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
Optimizer, toolbox, DestroyHandler,
......@@ -8,8 +10,12 @@ from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
from elemwise import GpuElemwise, _is_scalar
from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
......@@ -26,6 +32,7 @@ optdb.register('gpuarray_opt', gpu_seqopt,
optdb.__position__.get('add_destroy_handler', 49.5) - 1,
'gpuarray')
def register_opt(*tags, **kwargs):
def f(local_opt):
name = (kwargs and kwargs.pop('name')) or local_opt.__name__
......@@ -35,6 +42,36 @@ def register_opt(*tags, **kwargs):
register_opt()(theano.tensor.opt.local_track_shape_i)
def op_lifter(OP):
    """
    OP(..., host_from_gpu(), ...) -> host_from_gpu(GpuOP(...))
    gpu_from_host(OP(inp0, ...)) -> GpuOP(inp0, ...)

    Decorator factory: ``maker(node)`` must return either a GPU Op (which
    is then applied to the node's original inputs) or a GPU variable.
    """
    def f(maker):
        def local_opt(node):
            # Exact type match only -- subclasses of OP are not lifted.
            if type(node.op) is OP:
                # This does not support nodes that have more than one output.
                assert len(node.outputs) == 1

                # either one of our inputs is on the gpu or
                # all of our client are on the gpu
                # NOTE(review): all([]) is True, so an output with no
                # clients also triggers the lift -- confirm intended.
                if (any([i.owner and i.owner.op == host_from_gpu
                         for i in node.inputs]) or
                        all([c != 'output' and c.op == gpu_from_host
                             for c, idx in node.outputs[0].clients])):
                    new_op = maker(node)
                    # This is needed as sometimes new_op inherit from OP.
                    if new_op and new_op != node.op:
                        if isinstance(new_op, theano.Op):
                            return [host_from_gpu(new_op(*node.inputs))]
                        else:  # suppose it is a variable on the GPU
                            return [host_from_gpu(new_op)]
            # Signal "no replacement" to the optimizer.
            return False
        # Keep the decorated function's name for optimizer diagnostics.
        local_opt.__name__ = maker.__name__
        return local_optimizer([OP])(local_opt)
    return f
class InputToGpuOptimizer(Optimizer):
"Transfer the input to the gpu to start the rolling wave."
......@@ -63,6 +100,7 @@ class InputToGpuOptimizer(Optimizer):
gpu_seqopt.register('InputToGpuArrayOptimizer', InputToGpuOptimizer(),
0, 'fast_run', 'fast_compile', 'merge')
@local_optimizer([])
def local_cut_gpu_host_gpu(node):
if tensor.opt.opt.check_chain(node, gpu_from_host, host_from_gpu):
......@@ -78,67 +116,117 @@ gpu_cut_copies.register('cut_gpua_constant_transfers',
optdb['canonicalize'].register('local_cut_gpua_host_gpua',
local_cut_gpu_host_gpu, 'fast_run', 'gpuarray')
@register_opt()
@local_optimizer([tensor.Alloc])
@op_lifter(tensor.Alloc)
def local_gpualloc(node):
    """Move Alloc to the GPU when its value or its clients already live there.

    NOTE(review): the body mixes the old self-contained rewrite (returning
    ``[new_out]``) with the newer op_lifter protocol (returning ``gpu_alloc``
    at the end) -- merge residue worth revisiting.
    """
    replace = False
    if node.op == tensor.alloc:
        if node.inputs[0].owner and node.inputs[0].owner.op == host_from_gpu:
            # The filled value already comes from the GPU.
            replace = True
        elif all([c != 'output' and c.op == gpu_from_host
                  for c, idx in node.outputs[0].clients]):
            # Every client immediately transfers back to the GPU.
            replace = True
        elif all([c != 'output' and c.op == tensor.join and
                  all([i.owner and i.owner.op in [host_from_gpu, tensor.alloc]
                       for i in c.inputs[1:]])
                  for c, idx in node.outputs[0].clients]):
            # All clients are joins whose other operands are GPU-bound too.
            replace = True
    if replace:
        val = node.inputs[0]
        shp = node.inputs[1:]
        old_out = node.outputs[0]
        # NOTE(review): a shape_padleft(val, ...) intermediate was computed
        # here but never used; assumes gpu_alloc pads the value like
        # tensor.alloc does -- TODO confirm.
        new_out = host_from_gpu(gpu_alloc(val, *shp))
        if new_out.type != old_out.type:
            assert new_out.type.ndim == old_out.type.ndim
            assert new_out.type.dtype == old_out.type.dtype
            # The new output may only be less broadcastable than the old one.
            for b_old, b_new in zip(old_out.type.broadcastable,
                                    new_out.type.broadcastable):
                assert b_new or (not b_old)
            # BUG FIX: was `patternbroadcast(new_out. old_out.broadcastable)`
            # (a period instead of a comma), which raised AttributeError.
            new_out = tensor.patternbroadcast(new_out, old_out.broadcastable)
        return [new_out]
    return gpu_alloc
@register_opt()
@local_optimizer([])
@op_lifter(tensor.Reshape)
def local_gpureshape(node):
    """Lift a host Reshape to GpuReshape, prefixing its name with 'Gpu'."""
    op = node.op
    name = op.name
    if name:
        name = 'Gpu' + name
    # BUG FIX: the op was built with `op.name`, discarding the
    # 'Gpu'-prefixed name computed just above.
    res = GpuReshape(op.ndim, name)
    return res
@register_opt()
@op_lifter(tensor.Flatten)
def local_gpuflatten(node):
    """Lift Flatten(outdim=1) to a GpuReshape onto a single dimension."""
    if node.op.outdim != 1:
        # Flattening to more than one dimension is not a plain (-1,) reshape.
        return None
    reshape_to_vector = GpuReshape(node.op.outdim, None)
    return reshape_to_vector(node.inputs[0], theano.tensor.constant([-1]))
@register_opt()
@op_lifter(tensor.Elemwise)
def local_gpu_elemwise(node):
    """Move an Elemwise to the GPU when it touches GPU data.

    NOTE(review): everything after the final if/else (which always
    returns) is unreachable merge residue from the op_lifter-style
    rewrite -- both halves should be reconciled.
    """
    do_replace = False
    gpu_out = False
    # check for gpu_from_host(Elemwise)) and extract the Elemwise node
    if node.op == gpu_from_host:
        host_i, = node.inputs
        if (host_i.owner and
                isinstance(host_i.owner.op, tensor.Elemwise) and
                len(host_i.clients) == 1):
            node = host_i.owner
            do_replace = True
            # The replacement can stay on the GPU: no host_from_gpu wrapper.
            gpu_out = True

    # check for elemwise(..., host_from_gpu, ...)
    if isinstance(node.op, tensor.Elemwise):
        if numpy.any([i.owner and
                      i.owner.op == host_from_gpu
                      for i in node.inputs]):
            do_replace = True
        # All-scalar graphs are not worth moving to the GPU.
        if numpy.all([_is_scalar(i)
                      for i in node.inputs]):
            do_replace = False

    if do_replace:
        new_op = GpuElemwise(node.op.scalar_op)
        gpu_elemwise = new_op(*(gpu_from_host(i) for i in node.inputs))
        if gpu_out:
            return [gpu_elemwise]
        else:
            return [host_from_gpu(gpu_elemwise)]
    else:
        return False

    # --- unreachable from here down (see NOTE above) ---
    op = node.op
    name = op.name
    if name:
        name = 'Gpu'+name
    res = GpuElemwise(op.scalar_op, name=name,
                      inplace_pattern=copy.copy(op.inplace_pattern),
                      nfunc_spec=op.nfunc_spec)
    return res
def max_inputs_to_GpuElemwise(node):
    """Return how many inputs one GpuElemwise kernel launch can accept.

    The bound comes from the kernel argument-size limit: each array costs
    a pointer plus one int per dimension, and a fixed overhead covers the
    element count and the output arrays.
    """
    POINTER_BYTES = 8
    INT_BYTES = 4
    # we take the limit from CUDA for now
    ARGUMENT_LIMIT = 232

    nd = node.inputs[0].type.ndim
    # Cost of one array argument: its data pointer plus one int per dim.
    per_array = POINTER_BYTES + INT_BYTES * nd
    # Fixed cost: number of elements + output shape ints, then the outputs.
    fixed_cost = INT_BYTES * (nd + 1) + per_array * len(node.outputs)

    return (ARGUMENT_LIMIT - fixed_cost) // per_array
# Reuse the generic elemwise-fusion machinery with GpuElemwise and the
# kernel argument-size bound computed by max_inputs_to_GpuElemwise.
gpu_local_elemwise_fusion = tensor.opt.local_elemwise_fusion_op(
    GpuElemwise,
    max_inputs_to_GpuElemwise)
optdb.register('gpua_elemwise_fusion',
               tensor.opt.FusionOptimizer(gpu_local_elemwise_fusion), 71.00,
               'fast_run', 'fusion', 'local_elemwise_fusion', 'gpuarray')

# The inplace pass runs later (75) so it sees the fused graph.
inplace_gpu_elemwise_opt = tensor.opt.inplace_elemwise_optimizer_op(
    GpuElemwise)
optdb.register('gpua_inplace_opt', inplace_gpu_elemwise_opt, 75,
               'inplace_elemwise_optimizer', 'fast_run', 'inplace', 'gpuarray')
@register_opt()
@op_lifter(tensor.DimShuffle)
def local_gpua_dimshuffle(node):
    """Build the GPU DimShuffle mirroring the host op's pattern."""
    host_op = node.op
    return GpuDimShuffle(host_op.input_broadcastable, host_op.new_order)
@register_opt()
@op_lifter(tensor.SpecifyShape)
def local_gpua_specifyShape(node):
    # NOTE(review): returns the *host* op; op_lifter then rebuilds
    # specify_shape(*inputs) wrapped in host_from_gpu, and also requires
    # new_op != node.op, so whether this ever fires depends on
    # SpecifyShape's equality semantics -- confirm.
    return tensor.specify_shape
@register_opt()
@op_lifter(tensor.Subtensor)
def local_gpua_subtensor(node):
    # Same static index pattern, GPU storage.
    return GpuSubtensor(node.op.idx_list)
@register_opt()
@op_lifter(tensor.CAReduce)
def local_gpua_careduce(node):
    """Lift sum/product reductions to GpuCAReduce.

    Other scalar reductions are left on the host (implicit None return).
    """
    # Idiom: one isinstance call with a tuple instead of two or-ed calls.
    if isinstance(node.op.scalar_op, (scalar.basic.Add, scalar.basic.Mul)):
        return GpuCAReduce(node.op.scalar_op, axis=node.op.axis,
                           dtype=getattr(node.op, 'dtype', None),
                           acc_dtype=getattr(node.op, 'acc_dtype', None))
@register_opt()
@op_lifter(tensor.blas.Gemv)
def local_gpua_gemv(node):
    # Lift the generic BLAS Gemv to its gpuarray counterpart,
    # preserving the inplace flag.
    return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.blas_c.CGemv)
def local_gpua_gemv2(node):
    # The C-optimized CGemv lifts to the same GPU op as plain Gemv.
    return GpuGemv(inplace=node.op.inplace)
@register_opt()
@op_lifter(tensor.blas.Gemm)
def local_gpua_gemm(node):
    # Lift Gemm to the gpuarray implementation, preserving inplace.
    return GpuGemm(inplace=node.op.inplace)
import StringIO
import numpy
import theano
from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any
try:
import pygpu
from pygpu import gpuarray
except ImportError:
pass
from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
class GpuSubtensor(HideC, Subtensor):
    """Basic (integer/slice) indexing of a GpuArray.

    Mirrors theano.tensor.Subtensor: the static part of the index lives in
    ``self.idx_list``; only dynamic scalar indices travel as extra node
    inputs.
    """

    def make_node(self, x, *inputs):
        # Reuse the host op's make_node for index validation and the
        # output broadcast pattern, then swap in a GPU output type.
        rval = tensor.Subtensor.make_node(self, x, *inputs)
        otype = GpuArrayType(dtype=rval.outputs[0].type.dtype,
                             broadcastable=rval.outputs[0].type.broadcastable)
        x = as_gpuarray_variable(x)
        return gof.Apply(self, [x] + rval.inputs[1:], [otype()])

    def perform(self, node, inputs, out_):
        out, = out_
        x = inputs[0]

        # Fast path: a fully-constant index tuple cached by an earlier
        # call (perform_cache_cdata presumably initialized by the base
        # Subtensor -- confirm).
        if self.perform_cache_cdata is not None:
            out[0] = x.__getitem__(self.perform_cache_cdata)
            return

        # Merge the constant index pattern with the runtime scalar inputs.
        cdata = get_idx_list(inputs, self.idx_list)
        if len(cdata) == 1:
            cdata = cdata[0]
        if len(inputs) == 1:
            # No dynamic indices: the computed index is constant; cache it.
            self.perform_cache_cdata = cdata

        out[0] = x.__getitem__(cdata)

    def c_support_code(self):
        # Normalizes one slice (start/stop/step) the way Python does,
        # clamping out-of-range values.  The *_n flags mean "was None".
        return """
        static int fix_indices(ssize_t *start, ssize_t *stop, ssize_t *step,
                               int start_n, int stop_n, int step_n,
                               size_t len) {
            if (step_n) *step = 1;
            if (*step == 0) {
                PyErr_SetString(PyExc_ValueError, "slice step cannot be zero");
                return -1;
            }

            if (start_n) *start = (*step < 0) ? len-1 : 0;
            else {
                if (*start < 0) *start += len;
                if (*start < 0) *start = (*step < 0) ? -1 : 0;
                if (*start >= len) *start = (*step < 0) ? len-1 : len;
            }

            if (stop_n) *stop = (*step < 0) ? -1 : len;
            else {
                if (*stop < 0) *stop += len;
                if (*stop < 0) *stop = (*step < 0) ? -1 : 0;
                if (*stop >= len) *stop = (*step < 0) ? len-1 : len;
            }
            if (*stop < *start && *step > 0)
                *stop = *start;
            return 0;
        }
        """

    def c_code(self, node, name, inputs, outputs, sub):
        inp_ndim = node.inputs[0].ndim
        inp = inputs[0]
        # Runtime scalar index expressions, consumed in order by fix_idx.
        indices = inputs[1:]

        # pad out the index list to the same dimension as the input
        idx_list = self.idx_list + \
            ((slice(None),) * (inp_ndim - len(self.idx_list)))

        # This case fails when we use pygpu_index(), so here is some
        # special code
        if len(idx_list) == 0:
            return """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_copy(%(inp)s, GA_ANY_ORDER);
        if (!%(out)s) { %(fail)s }
        """ % dict(out=outputs[0], inp=inp, fail=sub['fail'])

        sio = StringIO.StringIO()
        # NOTE(review): `err` below appears unused in the emitted C.
        print >> sio, """
        ssize_t starts[%(sz)s];
        ssize_t stops[%(sz)s];
        ssize_t steps[%(sz)s];
        ssize_t cur;
        int err;

        if (%(inp)s->ga.nd != %(sz)s) {
            PyErr_SetString(PyExc_IndexError, "invalid index");
            %(fail)s
        }
        """ % dict(sz=len(idx_list), inp=inp, fail=sub['fail'])

        def fix_idx(idx):
            # Map one slice member to (C expression, was_None flag).
            if idx is None:
                return "0", 1
            elif isinstance(idx, (numpy.integer, int)):
                return str(idx), 0
            elif isinstance(idx, gof.Type):
                # Dynamic index: consume the next runtime input expression.
                return indices.pop(0), 0
            else:
                assert 0, idx

        for i, idx in enumerate(idx_list):
            if isinstance(idx, slice):
                start, start_n = fix_idx(idx.start)
                stop, stop_n = fix_idx(idx.stop)
                step, step_n = fix_idx(idx.step)
                print >>sio, """
                starts[%(i)s] = %(start)s;
                stops[%(i)s] = %(stop)s;
                steps[%(i)s] = %(step)s;
                if (fix_indices(&starts[%(i)s], &stops[%(i)s], &steps[%(i)s],
                                %(start_n)s, %(stop_n)s, %(step_n)s,
                                %(inp)s->ga.dimensions[%(i)s]) == -1) {
                    %(fail)s
                }
                """ % dict(i=i, start=start, stop=stop, step=step,
                           start_n=start_n, stop_n=stop_n, step_n=step_n,
                           fail=sub['fail'], inp=inp)
            else:
                # Scalar index; step == 0 marks "drop this dimension".
                if isinstance(idx, gof.Type):
                    start = indices.pop(0)
                elif isinstance(idx, (numpy.integer, int)):
                    start = idx
                else:
                    assert 0, idx
                print >>sio, """
                cur = %(start)s;
                if (cur < 0)
                    cur += %(inp)s->ga.dimensions[%(i)s];
                starts[%(i)s] = cur;
                steps[%(i)s] = 0;
                """ % dict(i=i, start=start, fail=sub['fail'], inp=inp)

        print >>sio, """
        Py_XDECREF(%(out)s);
        %(out)s = pygpu_index(%(inp)s, starts, stops, steps);
        if (!%(out)s) { %(fail)s }
        """ % dict(name=name, fail=sub['fail'], inp=inp, out=outputs[0])

        return sio.getvalue()

    def c_code_cache_version(self):
        return (5,)
from unittest import TestCase
from theano.tensor.blas import gemv_inplace, gemm_inplace
from theano.sandbox.gpuarray.tests.test_basic_ops import makeTester, rand
from theano.sandbox.gpuarray.blas import (gpugemv_inplace,
gpugemm_inplace)
# Cross-check GPU gemv against the CPU reference on a few shapes
# (makeTester builds a unittest case from these dicts).
GpuGemvTester = makeTester('GpuGemvTester',
    op=gemv_inplace, gpu_op=gpugemv_inplace,
    cases=dict(
        dot_vv=[rand(1), 1, rand(1, 2), rand(2), 0],
        dot_vm=[rand(3), 1, rand(3, 2), rand(2), 0],
        # Zero-size cases currently disabled:
        # test_02=[rand(0), 1, rand(0, 2), rand(2), 0],
        # test_30=[rand(3), 1, rand(3, 0), rand(0), 0],
        # test_00=[rand(0), 1, rand(0, 0), rand(0), 0],
        # Negative strides via reversed views.
        test_stride=[rand(3)[::-1], 1, rand(3, 2)[::-1], rand(2)[::-1], 0],
        )
    )
# Cross-check GPU gemm against the CPU reference for each sign
# combination of alpha and beta (beta == 0 is a special code path).
GpuGemmTester = makeTester('GpuGemmTester',
    op=gemm_inplace, gpu_op=gpugemm_inplace,
    cases=dict(
        test1=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 0.0],
        test2=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), 1.0],
        test3=[rand(3, 4), 1.0, rand(3, 5), rand(5, 4), -1.0],
        test4=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.0],
        test5=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), 0.6],
        test6=[rand(3, 4), 0.0, rand(3, 5), rand(5, 4), -1.0],
        test7=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 0.0],
        test8=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), 1.0],
        test9=[rand(3, 4), -1.0, rand(3, 5), rand(5, 4), -1.0],
        )
    )
import unittest
from theano import scalar, gof
from theano.gof import FunctionGraph
from theano.gof.python25 import all, any
from theano.tests.unittest_tools import SkipTest
from theano.tensor.tests.test_elemwise import (test_Broadcast, test_DimShuffle,
test_CAReduce)
from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, GpuDimShuffle,
GpuCAReduce)
from theano.sandbox.gpuarray.type import GpuArrayType
from pygpu.array import gpuarray
# This is actually a test for GpuElemwise
class test_gpu_Broadcast(test_Broadcast):
    """Run the base Broadcast (Elemwise) tests against GpuElemwise."""
    op = GpuElemwise
    type = GpuArrayType

    def rand_val(self, shp):
        # Idiom fix: pass the keyword directly instead of **dict(...).
        return rand_gpuarray(*shp, cls=gpuarray)

    # no c_code() yet
    #cop = GpuElemwise
    #ctype = GpuArrayType
    #def rand_cval(self, shp):
    #    return rand_gpuarray(*shp, **dict(cls=gpuarray))
class test_GpuDimShuffle(test_DimShuffle):
    # Reuse the host DimShuffle test suite, swapping in the GPU op.
    op = GpuDimShuffle
class test_GpuCAReduce(test_CAReduce):
    """Run the CAReduce test suite against GpuCAReduce (perform only)."""
    # float32 to stay compatible with limited devices; the integer
    # dtypes exercise the non-float code paths of the base tests.
    dtypes = ["float32"]
    bin_dtypes = ["uint8", "int8"]
    op = GpuCAReduce
    # Reductions currently handled by the GPU op (see local_gpua_careduce).
    reds = [scalar.add, scalar.mul]

    def test_perform(self):
        # Every (dtype, reduction) pair through the Python implementation.
        for dtype in self.dtypes + self.bin_dtypes:
            for op in self.reds:
                self.with_linker(gof.PerformLinker(), op, dtype=dtype)

    def test_perform_nan(self):
        # NaN handling is only checked on the float dtypes.
        for dtype in self.dtypes:
            for op in self.reds:
                self.with_linker(gof.PerformLinker(), op, dtype=dtype,
                                 test_nan=True)

    def test_c(self):
        raise SkipTest("no C code")

    def test_c_nan(self):
        raise SkipTest("no C code")
import numpy
import theano
from theano.tests import unittest_tools as utt
from theano.sandbox.gpuarray.basic_ops import GpuReshape
import theano.sandbox.gpuarray
if theano.sandbox.gpuarray.pygpu is None:
raise SkipTest("pygpu not installed")
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available and not theano.sandbox.gpuarray.pygpu_activated:
if not cuda_ndarray.use.device_number:
cuda_ndarray.use('gpu')
theano.sandbox.gpuarray.init_dev('cuda')
if not theano.sandbox.gpuarray.pygpu_activated:
raise SkipTest("pygpu disabled")
if theano.config.mode == 'FAST_COMPILE':
mode_with_gpu = theano.compile.mode.get_mode('FAST_RUN').including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_mode('FAST_RUN').excluding('gpuarray')
else:
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpuarray').excluding('gpu')
mode_without_gpu = theano.compile.mode.get_default_mode().excluding('gpuarray')
def test_flatten():
    """flatten() on the GPU must go through a GpuReshape node."""
    x = theano.tensor.fmatrix()
    fn = theano.function([x], x.flatten(), mode=mode_with_gpu)

    data = numpy.random.rand(10, 11).astype("float32")
    expected = data.flatten()
    out = fn(data)

    utt.assert_allclose(out, expected)
    assert out.shape == expected.shape
    assert any(type(node.op) is GpuReshape
               for node in fn.maker.fgraph.toposort())
\ No newline at end of file
from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
from theano.sandbox.gpuarray.tests.test_basic_ops import mode_with_gpu
from theano.compile import DeepCopyOp
from theano import tensor
class G_subtensor(T_subtensor):
    """Re-run the generic Subtensor test suite against GpuSubtensor."""

    def shortDescription(self):
        # Return None so the runner prints the test name, not a docstring.
        return None

    def __init__(self, name):
        T_subtensor.__init__(self, name,
                             shared=gpuarray_shared_constructor,
                             sub=GpuSubtensor,
                             mode=mode_with_gpu,
                             # avoid errors with limited devices
                             dtype='float32',
                             ignore_topo=(HostFromGpu, GpuFromHost,
                                          DeepCopyOp))
        assert self.sub == GpuSubtensor
import operator
import theano
from theano.compile import DeepCopyOp
from theano.sandbox.gpuarray.tests.test_basic_ops import rand_gpuarray
from theano.sandbox.gpuarray.type import GpuArrayType
def test_deep_copy():
    """An identity function on a GPU variable must compile to DeepCopyOp."""
    value = rand_gpuarray(20, dtype='float32')
    var = GpuArrayType(dtype='float32', broadcastable=(False,))('g')

    fn = theano.function([var], var)
    first = fn.maker.fgraph.toposort()[0]
    assert isinstance(first.op, DeepCopyOp)

    result = fn(value)
    assert GpuArrayType.values_eq(result, value)
import numpy
import theano
from theano.tensor.var import _tensor_py_operators
from theano import Type, Variable, Constant, tensor, config, scalar
from theano.compile import SharedVariable
......@@ -26,7 +27,10 @@ class GpuArrayType(Type):
except gpuarray.GpuArrayException:
raise TypeError("Unsupported dtype for %s: %s" %
(self.__class__.__name__, self.dtype))
def __str__(self):
    # Human-readable form, e.g. "GpuArrayType(float32, (False, True))".
    return "GpuArrayType({0}, {1})".format(self.dtype, self.broadcastable)
def filter(self, data, strict=False, allow_downcast=None):
if strict:
if not isinstance(data, gpuarray.GpuArray):
......@@ -103,8 +107,8 @@ class GpuArrayType(Type):
return GpuArrayType.values_eq(a, b)
else:
res = elemwise2(a, '', b, a, odtype=numpy.dtype('bool'),
op_tmpl="res[i] = ((%(a)s - %(b)s) <" \
"(1e-8 + 1e-5 * fabs(%(b)s)))")
op_tmpl="res[i] = ((%(a)s - %(b)s) <"
"(1e-8 + 1e-5 * fabs(%(b)s)))")
return numpy.asarray(res).all()
def value_zeros(self, shape):
......@@ -134,7 +138,7 @@ class GpuArrayType(Type):
return numpy.dtype(self.dtype).itemsize
def c_declare(self, name, sub):
return "GpuArrayObject *%s;" % (name,)
return "PyGpuArrayObject *%s;" % (name,)
def c_init(self, name, sub):
return "%s = NULL;" % (name,)
......@@ -149,17 +153,17 @@ class GpuArrayType(Type):
}
/* First check if we are the base type exactly (the most common case),
then do the full subclass check if needed. */
if (py_%(name)s->ob_type != &GpuArrayType &&
!PyObject_TypeCheck(py_%(name)s, &GpuArrayType)) {
if (py_%(name)s->ob_type != &PyGpuArrayType &&
!PyObject_TypeCheck(py_%(name)s, &PyGpuArrayType)) {
PyErr_SetString(PyExc_ValueError, "expected a GpuArray");
%(fail)s
}
%(name)s = (GpuArrayObject *)py_%(name)s;
%(name)s = (PyGpuArrayObject *)py_%(name)s;
Py_INCREF(%(name)s);
""" % {'name': name, 'fail': sub['fail']}
def c_cleanup(self, name, sub):
return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name }
return "Py_XDECREF(%(name)s); %(name)s = NULL;" % {'name': name}
def c_sync(self, name, sub):
return """
......@@ -184,7 +188,8 @@ class GpuArrayType(Type):
# We need arrayobject for the PyArrayDescr struct def
# (even if we just use a pointer to it in a function def)
return ['<compyte/array.h>', '<compyte/kernel.h>', '<compyte/error.h>',
'<numpy/arrayobject.h>', '<gpuarray_api.h>']
'<compyte/buffer_blas.h>', '<numpy/arrayobject.h>',
'<gpuarray_api.h>']
def c_header_dirs(self):
return [pygpu.get_include(), numpy.get_include()]
......@@ -193,10 +198,13 @@ class GpuArrayType(Type):
return ['compyte']
def c_code_cache_version(self):
return (1,)
ver = pygpu.gpuarray.api_version()
# we only use the major version since the minor revision are
# API-compatible.
return (1, ver[0])
class _operators(tensor.basic._tensor_py_operators):
class _operators(_tensor_py_operators):
def _as_TensorVariable(self):
from basic_ops import host_from_gpu
return host_from_gpu(self)
......@@ -204,10 +212,6 @@ class _operators(tensor.basic._tensor_py_operators):
def _as_GpuArrayVariable(self):
return self
dtype = property(lambda s: s.type.dtype)
broadcastable = property(lambda s: s.type.broadcastable)
ndim = property(lambda s: s.type.ndim)
class GpuArrayVariable(_operators, Variable):
    # Symbolic variable stored on the GPU; all behavior comes from the
    # _operators mixin plus the generic Variable machinery.
    pass
......@@ -276,12 +280,6 @@ theano.compile.register_view_op_c_code(GpuArrayType, """
theano.compile.register_deep_copy_op_c_code(GpuArrayType, """
Py_XDECREF(%(oname)s);
%(oname)s = new_GpuArray((PyObject *)&GpuArrayType, GpuArray_default_context());
%(oname)s = pygpu_copy(%(iname)s, GA_ANY_ORDER);
if (!%(oname)s) { %(fail)s }
int err;
err = GpuArray_copy(&%(oname)s->ga, &%(iname)s->ga, GA_ANY_ORDER);
if (err != GA_NO_ERROR) {
PyErr_SetString(PyExc_RuntimeError, "Error during copy");
%(fail)s
}
""", version=(1,))
""", version=(5,))
......@@ -3018,7 +3018,7 @@ class Composite(ScalarOp):
rval = []
for subnode in self.fgraph.toposort():
try:
rval.append(subnode.op.c_support_code())
rval.append(subnode.op.c_support_code().strip())
except gof.utils.MethodNotDefined:
pass
# remove duplicate code blocks
......
......@@ -143,10 +143,6 @@ class DimShuffle(Op):
# list of dimensions of the input to drop
self.drop = []
# this maps i before dropping dimensions to j after dropping dimensions
# so self.shuffle can be set properly later on
i2j = {}
j = 0
for i, b in enumerate(input_broadcastable):
if i not in new_order:
# we want to drop this dimension because it's not a value in
......@@ -158,14 +154,9 @@ class DimShuffle(Op):
raise ValueError(
"You cannot drop a non-broadcastable dimension.",
(input_broadcastable, new_order))
else:
i2j[i] = j
j += 1
# transposition of non-broadcastable dimensions
# This is how the dimensions will be permuted, without accounting for
# the extra 'x' broadcastable dimensions to insert.
self.shuffle = [i2j[x] for x in new_order if x != 'x']
# this is the list of the original dimensions that we keep
self.shuffle = [x for x in new_order if x != 'x']
# list of dimensions of the output that are broadcastable and were not
# in the original input
......@@ -237,16 +228,12 @@ class DimShuffle(Op):
res = input
if type(res) != numpy.ndarray and type(res) != numpy.memmap:
raise TypeError(res)
shape = list(res.shape)
for drop in reversed(self.drop):
shape.pop(drop)
res = res.reshape(shape)
# transpose
res = res.transpose(self.shuffle)
res = res.transpose(self.shuffle+self.drop)
# augment
shape = list(res.shape)
shape = list(res.shape[:len(self.shuffle)])
for augm in self.augment:
shape.insert(augm, 1)
res = res.reshape(shape)
......@@ -259,9 +246,6 @@ class DimShuffle(Op):
def infer_shape(self, node, shapes):
ishp, = shapes
ishp = list(ishp)
for drop in reversed(self.drop):
del ishp[drop]
# transpose
rval = [ishp[i] for i in self.shuffle]
......
......@@ -410,9 +410,9 @@ def local_dimshuffle_lift(node):
inode = input.owner
if inode and isinstance(inode.op, Elemwise) and (len(input.clients) == 1):
# Don't use make_node to have tag.test_value set.
ret = inode.op(*[DimShuffle(inp.type.broadcastable,
op.new_order,
op.inplace)(inp) for inp in
ret = inode.op(*[op.__class__(inp.type.broadcastable,
op.new_order,
op.inplace)(inp) for inp in
inode.inputs], **dict(return_list=True))
return ret
if inode and isinstance(inode.op, DimShuffle):
......@@ -424,8 +424,8 @@ def local_dimshuffle_lift(node):
iinput.type.ndim):
return [iinput]
else:
ret = DimShuffle(iinput.type.broadcastable, new_order,
inplace)(iinput, **dict(return_list=True))
ret = op.__class__(iinput.type.broadcastable, new_order,
inplace)(iinput, **dict(return_list=True))
return ret
......@@ -460,7 +460,7 @@ def dimshuffle_as_view(node):
op = node.op
if not isinstance(op, DimShuffle) or op.inplace:
return False
new_op = DimShuffle(op.input_broadcastable, op.new_order, inplace=True)
new_op = op.__class__(op.input_broadcastable, op.new_order, inplace=True)
return [new_op(*node.inputs)]
#Step 60 is the inplace optimization stage.
......@@ -4609,7 +4609,7 @@ def local_elemwise_fusion_op(OP, max_input_fct=lambda node: 1024):
# worthwhile if the summation axis doesn't line up with a
# contiguous dimension)
if not isinstance(node.op, OP):
if type(node.op) is not OP:
return False
inputs = [] # inputs of the new Elemwise op.
s_inputs = [] # inputs of the new scalar op used by the Composite.
......
......@@ -44,7 +44,7 @@ from theano.tensor import (_shared, wvector, bvector, autocast_float_as,
dtensor3, SpecifyShape, Mean,
itensor3, Tile, switch, Diagonal, Diag,
nonzero, flatnonzero, nonzero_values,
stacklists)
stacklists, DimShuffle)
from theano.tests import unittest_tools as utt
......@@ -4204,9 +4204,30 @@ class T_op_cache(unittest.TestCase):
self.assertTrue(numpy.all(fn_py(a) == fn_c_or_py(a)))
class T_reshape(unittest.TestCase):
def setUp(self):
utt.seed_rng()
class T_reshape(utt.InferShapeTester, utt.TestOptimizationMixin):
def __init__(self, name, shared=tensor._shared, op=Reshape, mode=None,
ignore_topo=(DeepCopyOp, opt.MakeVector,
opt.Shape_i, DimShuffle, theano.tensor.Elemwise)):
self.shared = shared
self.op = op
#The tag canonicalize is needed for the shape test in FAST_COMPILE
self.mode = mode
self.ignore_topo = ignore_topo
return super(T_reshape, self).__init__(name)
def function(self, inputs, outputs):
f = function(inputs, outputs, mode=self.mode)
if self.mode is not None or theano.config.mode != "FAST_COMPILE":
topo = f.maker.fgraph.toposort()
topo_ = [node for node in topo if not isinstance(node.op,
self.ignore_topo)]
assert len(topo_) == 1, topo_
return f
def eval_output_and_check(self, t):
f = self.function([], t)
tval = f()
return tval
def test_reshape(self):
a = dvector()
......@@ -4215,7 +4236,7 @@ class T_reshape(unittest.TestCase):
#basic to 1 dim(without list)
c = reshape(b, as_tensor_variable(6), ndim=1)
f = inplace_func([b], c)
f = self.function([b], c)
b_val1 = numpy.asarray([[0, 1, 2], [3, 4, 5]])
c_val1 = numpy.asarray([0, 1, 2, 3, 4, 5])
......@@ -4231,7 +4252,7 @@ class T_reshape(unittest.TestCase):
#basic to 1 dim(with list)
c = reshape(b, (as_tensor_variable(6),), ndim=1)
f = inplace_func([b], c)
f = self.function([b], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
numpy.asarray([0, 1, 2, 3, 4, 5]))
#print f.maker.fgraph.toposort()
......@@ -4239,14 +4260,14 @@ class T_reshape(unittest.TestCase):
#basic to shape object of same ndim
c = reshape(b, d.shape)
f = inplace_func([b, d], c)
f = self.function([b, d], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]]),
[[0, 1], [2, 3], [4, 5]]) ==
numpy.asarray([[0, 1], [2, 3], [4, 5]]))
#basic to 2 dims
c = reshape(a, [2, 3])
f = inplace_func([a], c)
f = self.function([a], c)
assert numpy.all(f(numpy.asarray([0, 1, 2, 3, 4, 5])) ==
numpy.asarray([[0, 1, 2], [3, 4, 5]]))
......@@ -4255,7 +4276,7 @@ class T_reshape(unittest.TestCase):
a_val_copy = numpy.asarray([0, 1, 2, 3, 4, 5])
b_val = numpy.asarray([[0, 1, 2], [3, 4, 5]])
f_sub = inplace_func([a, b], c - b)
f_sub = self.function([a, b], c - b)
assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(a_val == a_val_copy)
......@@ -4264,35 +4285,33 @@ class T_reshape(unittest.TestCase):
a_val_copy = theano._asarray([0, 1, 2, 3, 4, 5], dtype='float64')
b_val = theano._asarray([[0, 1, 2], [3, 4, 5]], dtype='float64')
f_sub = inplace_func([a, b], c - b)
f_sub = self.function([a, b], c - b)
assert numpy.all(f_sub(a_val, b_val) == 0.0)
assert numpy.all(a_val == a_val_copy)
# verify gradient
def just_vals(v):
return Reshape(2)(v, theano._asarray([2, 3], dtype='int32'))
utt.verify_grad(just_vals, [a_val])
utt.verify_grad(just_vals, [a_val], mode=self.mode)
#test infer_shape
f_sub = function([a, b], (c - b).shape)
if config.mode == "FAST_COMPILE":
assert len(f_sub.maker.fgraph.toposort()) == 3
else:
topo = f_sub.maker.fgraph.toposort()
assert len(topo) == 1
topo[0].op == theano.compile.function_module.deep_copy_op
#assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5]]))==[2,3])#work in FAST_RUN, but fail on other!
#assert numpy.all(f_sub(a_val,numpy.asarray([[0,1],[2,3],[4,5],[6,7]]))==[2,3])#work in FAST_RUN, but fail on other!
self._compile_and_check([a], [c], (a_val,), self.op)
# test broadcast flag for constant value of 1
c = reshape(b, (b.shape[0], b.shape[1], 1))
f = inplace_func([b], c)
f = self.function([b], c)
assert numpy.all(f(numpy.asarray([[0, 1, 2], [3, 4, 5]])) ==
numpy.asarray([[[0], [1], [2]], [[3], [4], [5]]]))
assert (f.maker.fgraph.toposort()[-2].outputs[0].type.broadcastable ==
(False, False, True))
assert numpy.all(f_sub(a_val, b_val) == [2, 3])
def test_m1(self):
t = tensor3()
rng = numpy.random.RandomState(seed=utt.fetch_seed())
val = rng.uniform(size=(3, 4, 5)).astype(config.floatX)
for out in [t.reshape([-1]), t.reshape([-1, 5]),
t.reshape([5, -1]), t.reshape([5, -1, 3])]:
self._compile_and_check([t], [out], [val], self.op)
def test_reshape_long_in_shape(self):
v = dvector('v')
......@@ -4311,14 +4330,14 @@ class T_reshape(unittest.TestCase):
r = a.reshape(shapes, ndim=1)
z = zeros_like(r)
f = function([a, shapes], z.shape)
f = self.function([a, shapes], z.shape)
self.assertRaises(ValueError, f, a_val, [13])
#Test reshape to 2 dim
r = a.reshape(shapes, ndim=2)
z = zeros_like(r)
f = function([a, shapes], z.shape)
f = self.function([a, shapes], z.shape)
self.assertRaises(ValueError, f, a_val, [-1, 5])
self.assertRaises(ValueError, f, a_val, [7, -1])
......
......@@ -122,10 +122,9 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
try:
try:
self.eval_output_and_check(t)
assert 0
except Exception, e:
if 'out of bounds' not in exc_message(e):
raise
except IndexError, e:
return
self.fail()
finally:
_logger.setLevel(oldlevel)
......@@ -161,7 +160,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def test1_0_dims(self):
n = self.shared(numpy.ones((), dtype=self.dtype))
t = theano.tensor.Subtensor([])(n)
t = self.sub([])(n)
self.assertTrue(isinstance(t.owner.op, Subtensor))
mode = self.mode
self.mode = mode.excluding("local_useless_subtensor")
......@@ -188,7 +187,6 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(tval == 5.0)
def test1_ok_range_infinite(self):
#Subtensor.debug = True
n = self.shared(numpy.arange(3, dtype=self.dtype))
t = n[1:]
self.assertTrue(isinstance(t.owner.op, Subtensor))
......
......@@ -543,8 +543,8 @@ class _tensor_py_operators:
def get_scalar_constant_value(self):
return theano.tensor.basic.get_scalar_constant_value(self)
def zeros_like(self, dtype=None):
return theano.tensor.basic.zeros_like(self, dtype=dtype)
def zeros_like(model, dtype=None):
return theano.tensor.basic.zeros_like(model, dtype=dtype)
class TensorVariable(_tensor_py_operators, Variable):
......
......@@ -182,7 +182,10 @@ class InferShapeTester(unittest.TestCase):
def setUp(self):
seed_rng()
# Take into account any mode that may be defined in a child class
mode = getattr(self, 'mode', theano.compile.get_default_mode())
# and it can be None
mode = getattr(self, 'mode', None)
if mode is None:
mode = theano.compile.get_default_mode()
# This mode seems to be the minimal one including the shape_i
# optimizations, if we don't want to enumerate them explicitly.
self.mode = mode.including("canonicalize")
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论