提交 e2bd5b94 authored 作者: Matthew Koichi Grimes's avatar Matthew Koichi Grimes

Merge branch 'master' into sparse_stack_fix

.. _opfromgraph:
===========
OpFromGraph
===========
This page describes :class:`theano.OpFromGraph
<theano.compile.builders.OpFromGraph>`, an Op that allows to
encapsulate a Theano graph in an Op.
This can be used to encapsulate some functionality in one block. It is
useful to scale Theano compilation for regular bigger graphs when we
reuse that encapsulated functionality with different inputs many
times. Due to this encapsulation, it can make the Theano compilation
phase faster for graphs with many nodes.
Using this for small graphs isn't recommended, as it disables
optimization between what is inside the encapsulation and what is
outside it.
.. note::

    This wasn't used widely up to now. If you have any
    questions/comments, don't hesitate to contact us on the mailing list.
.. autoclass:: theano.compile.builders.OpFromGraph
...@@ -9,8 +9,6 @@ from theano.compile.mode import * ...@@ -9,8 +9,6 @@ from theano.compile.mode import *
from theano.compile.io import * from theano.compile.io import *
from theano.compile.builders import *
from theano.compile.module import * from theano.compile.module import *
from theano.compile.debugmode import DebugMode from theano.compile.debugmode import DebugMode
...@@ -25,4 +23,6 @@ from theano.compile.sharedvalue import (shared, shared_constructor, ...@@ -25,4 +23,6 @@ from theano.compile.sharedvalue import (shared, shared_constructor,
SharedVariable) SharedVariable)
from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared from theano.compile.pfunc import pfunc, Param, rebuild_collect_shared
from theano.compile.builders import *
from theano.compile.function import function from theano.compile.function import function
from theano import gof from theano import gof
from theano import gradient as G from theano import gradient as G
from theano.compile.function_module import orig_function from theano.compile.function_module import orig_function
from theano.compile import SharedVariable, rebuild_collect_shared
from theano.gof import ops_with_inner_function from theano.gof import ops_with_inner_function
class OpFromGraph(gof.Op): class OpFromGraph(gof.Op):
""" """This create an `Op` from inputs and outputs list of variables.
This create an L{Op} from a list of input variables and a list of output
variables. The signature is similar to theano.function() and the resulting
`Op` perform will do the same operation as::
The signature is the same as the signature of L{FunctionFactory}
and/or function and the resulting L{Op}'s perform will do the same orig_function(inputs, outputs, **kwargs)
operation as::
function(inputs, outputs, **kwargs) TODO:
- examples for a multi-layer mlp. where?
Take note that the following options, if provided, must take the - __hash__, __eq__ otherwise won't merge, try gof.opt.is_same_graph_with_merge(op1.new_outputs, op2, new_outputs)
value(s) listed below: - c_code() to remove the double overhead?
unpack_single = False - opt to unfold it, work inplace on inputs
borrow_outputs = False - grad() make it support DisconnectedType and the new interface
- check how it work with updates.
OpFromGraph takes an additional input, grad_depth. If grad_depth - add test with constant as input or inside the inner graph.
is n, OpFromGraph will make special Ops for gradients up to the - Add support for the GPU? Probably just need an opt to remove transfer
nth level, allowing the user to differentiate this op up to n - Add support to pickle this Op.
times. The parameter defaults to 1. If grad_depth == 0, the op - Add support/test with random generator
will not be differentiable. :note:
- We support shared variable in the inner graph. This is automatic and
Example: invisible to the user. They can be as input to the node or in the
inner graph.
- We support unused inputs. This is needed for the grad.
Example 1:
.. code-block:: python
from theano import function, OpFromGraph, tensor
x, y, z = tensor.scalars('xyz') x, y, z = tensor.scalars('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], linker='c') op = OpFromGraph([x, y, z], [e])
# op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2])
Example 2 with shared variable:
.. code-block:: python
import numpy
import theano
from theano import config, function, OpFromGraph, tensor
x, y, z = tensor.scalars('xyz')
s = theano.shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e])
# op behaves like a normal theano op # op behaves like a normal theano op
e2 = op(x, y, z) + op(z, y, x) e2 = op(x, y, z) + op(z, y, x)
fn = function([x, y, z], [e2]) fn = function([x, y, z], [e2])
""" """
def __init__(self, inputs, outputs, grad_depth=1, **kwargs): def __init__(self, inputs, outputs, **kwargs):
if not isinstance(outputs, list): if not isinstance(outputs, list):
raise TypeError('outputs must be list', outputs) raise TypeError('outputs must be list', outputs)
for i in inputs + outputs: for i in inputs + outputs:
...@@ -44,34 +71,33 @@ class OpFromGraph(gof.Op): ...@@ -44,34 +71,33 @@ class OpFromGraph(gof.Op):
if 'updates' in kwargs: if 'updates' in kwargs:
raise TypeError('updates are not allowed in kwargs') raise TypeError('updates are not allowed in kwargs')
# TODO: the graph may have implicit inputs like # To support correctly shared variables the inner fct should
# SharedVariable instances. # not see them. Otherwise their is problem with the gradient.
# what impact to they have on the validity of this Op? self.shared_inputs = [var for var in gof.graph.inputs(outputs)
self.fn = orig_function(inputs, outputs, **kwargs) if isinstance(var, SharedVariable)]
used_inputs = [var for var in gof.graph.inputs(outputs)
if not isinstance(var, gof.Constant)]
shared_vars = [var.type() for var in self.shared_inputs]
new = rebuild_collect_shared(outputs, inputs=inputs + shared_vars,
replace=dict(zip(self.shared_inputs,
shared_vars)),
copy_inputs_over=False)
(new_inputs, new_outputs,
[clone_d, update_d, update_expr, shared_inputs]) = new
assert len(new_inputs) == len(inputs) + len(self.shared_inputs)
assert len(new_outputs) == len(outputs)
assert not update_d
assert not update_expr
assert not shared_inputs
self.new_inputs = new_inputs
self.new_outputs = new_outputs
self.inputs = inputs self.inputs = inputs
self.outputs = outputs self.outputs = outputs
self.kwargs = kwargs
self.input_types = [input.type for input in inputs] self.input_types = [input.type for input in inputs]
self.output_types = [output.type for output in outputs] self.output_types = [output.type for output in outputs]
if grad_depth > 0:
output_grads = [t() for t in self.output_types]
# OpFromGraph doesn't implement a connection_pattern, so for now we regard
# all inputs and outputs as connected. This will compute the right numerical
# value for the gradients but could fail to raise the disconnected inputs error
# in some cases.
gs = G.grad(cost=None, known_grads=dict(zip(self.outputs, output_grads)),
wrt=self.inputs, disconnected_inputs='ignore')
self.grad_ops = []
for g in gs:
if g is None:
self.grad_ops.append(lambda *args: None)
else:
# It is normal if some inputs are not needed in order
# to compute the gradient, so we ignore them.
self.grad_ops.append(OpFromGraph(inputs + output_grads,
[g],
grad_depth=grad_depth - 1,
on_unused_input='ignore'))
def __eq__(self, other): def __eq__(self, other):
#TODO: recognize a copy #TODO: recognize a copy
...@@ -87,9 +113,18 @@ class OpFromGraph(gof.Op): ...@@ -87,9 +113,18 @@ class OpFromGraph(gof.Op):
raise TypeError("Wrong type, expected %s but got %s" raise TypeError("Wrong type, expected %s but got %s"
% (type, input.type)) % (type, input.type))
return gof.Apply(self, return gof.Apply(self,
inputs, list(inputs) + self.shared_inputs,
[type() for type in self.output_types]) [type() for type in self.output_types])
def make_thunk(self, node, storage_map, compute_map, no_recycling):
ret = super(OpFromGraph, self).make_thunk(node, storage_map,
compute_map, no_recycling)
if not hasattr(self, "fn"):
self.fn = orig_function(self.new_inputs,
self.new_outputs,
**self.kwargs)
return ret
def perform(self, node, inputs, outputs): def perform(self, node, inputs, outputs):
variables = self.fn(*inputs) variables = self.fn(*inputs)
assert len(variables) == len(outputs) assert len(variables) == len(outputs)
...@@ -99,10 +134,32 @@ class OpFromGraph(gof.Op): ...@@ -99,10 +134,32 @@ class OpFromGraph(gof.Op):
output[0] = variable.copy() output[0] = variable.copy()
def grad(self, inputs, output_grads): def grad(self, inputs, output_grads):
if hasattr(self, 'grad_ops'): # OpFromGraph doesn't implement a connection_pattern, so for
return [go(*(inputs + output_grads)) for go in self.grad_ops] # now we regard all inputs and outputs as connected. This will
# compute the right numerical value for the gradients but
# could fail to raise the disconnected inputs error in some
# cases.
if hasattr(self, "grad_ops"):
grad_ops = self.grad_ops
else: else:
raise NotImplementedError gs = G.grad(cost=None,
known_grads=dict(zip(self.new_outputs, output_grads)),
wrt=self.new_inputs,
disconnected_inputs='ignore')
grad_ops = []
for g in gs:
if g is None:
grad_ops.append(lambda *args: None)
else:
# It is normal if some inputs are not needed in order
# to compute the gradient, so we ignore them.
grad_ops.append(OpFromGraph(self.new_inputs + output_grads,
[g],
on_unused_input='ignore'))
self.grad_ops = grad_ops
return [go(*(inputs + output_grads)) for go in grad_ops]
# Since OpFromGraph contains a Theano compiled function, we should let # Since OpFromGraph contains a Theano compiled function, we should let
# DebugMode know about it # DebugMode know about it
......
...@@ -1036,7 +1036,7 @@ class FunctionMaker(object): ...@@ -1036,7 +1036,7 @@ class FunctionMaker(object):
# initialize the linker # initialize the linker
if not hasattr(linker, 'accept'): if not hasattr(linker, 'accept'):
raise ValueError("'linker' parameter of FunctionFactory should be a Linker with an accept method " \ raise ValueError("'linker' parameter of FunctionMaker should be a Linker with an accept method " \
"or one of %s" % theano.compile.mode.predefined_linkers.keys()) "or one of %s" % theano.compile.mode.predefined_linkers.keys())
#the 'no_borrow' outputs are the ones for which that we can't return the internal storage pointer. #the 'no_borrow' outputs are the ones for which that we can't return the internal storage pointer.
......
import numpy import numpy
import unittest import unittest
from theano import config from theano import config, shared
from theano.compile import function from theano.compile import function
...@@ -17,7 +17,9 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -17,7 +17,9 @@ class T_OpFromGraph(unittest.TestCase):
x, y, z = T.matrices('xyz') x, y, z = T.matrices('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN') op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z) - op(y, z, x) # (1+3*5=array of 16) - (3+1*5=array of 8) # (1+3*5=array of 16) - (3+1*5=array of 8)
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f) fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX) xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3 yv = numpy.ones((2, 2), dtype=config.floatX)*3
...@@ -47,7 +49,7 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -47,7 +49,7 @@ class T_OpFromGraph(unittest.TestCase):
def test_grad(self): def test_grad(self):
x, y, z = T.matrices('xyz') x, y, z = T.matrices('xyz')
e = x + y * z e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN', grad_depth=2) op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z) f = op(x, y, z)
f = f - T.grad(T.sum(f), y) f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f) fn = function([x, y, z], f)
...@@ -56,6 +58,56 @@ class T_OpFromGraph(unittest.TestCase): ...@@ -56,6 +58,56 @@ class T_OpFromGraph(unittest.TestCase):
zv = numpy.ones((2, 2), dtype=config.floatX)*5 zv = numpy.ones((2, 2), dtype=config.floatX)*5
assert numpy.all(11.0 == fn(xv, yv, zv)) assert numpy.all(11.0 == fn(xv, yv, zv))
def test_grad_grad(self):
x, y, z = T.matrices('xyz')
e = x + y * z
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3
zv = numpy.ones((2, 2), dtype=config.floatX)*5
assert numpy.allclose(6.0, fn(xv, yv, zv))
def test_shared(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
# (1+3*5=array of 16) - (3+1*5=array of 8)
f = op(x, y, z) - op(y, z, x)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX)*3
zv = numpy.ones((2, 2), dtype=config.floatX)*5
#print function, function.__module__
#print fn.maker.fgraph.toposort()
assert numpy.allclose(8.0, fn(xv, yv, zv))
assert numpy.allclose(8.0, fn(xv, yv, zv))
def test_shared_grad(self):
x, y, z = T.matrices('xyz')
s = shared(numpy.random.rand(2, 2).astype(config.floatX))
e = x + y * z + s
op = OpFromGraph([x, y, z], [e], mode='FAST_RUN')
f = op(x, y, z)
f = f - T.grad(T.sum(f), y)
fn = function([x, y, z], f)
xv = numpy.ones((2, 2), dtype=config.floatX)
yv = numpy.ones((2, 2), dtype=config.floatX) * 3
zv = numpy.ones((2, 2), dtype=config.floatX) * 5
assert numpy.allclose(11.0 + s.get_value(), fn(xv, yv, zv))
# grad again the shared variable
f = op(x, y, z)
f = f - T.grad(T.sum(f), s)
fn = function([x, y, z], f)
assert numpy.allclose(15.0 + s.get_value(),
fn(xv, yv, zv))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2794,20 +2794,7 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def set_view_base(self, x, fail): def add_to_zview(self, name, x, fail):
return """
//Set the base only now
if(CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(zview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(zview);
%(fail)s;
}""" % locals()
def add_to_zview(self, x, fail):
return """ return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview, PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
......
...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise): ...@@ -149,9 +149,19 @@ class GpuElemwise(HideC, Elemwise):
#define ga_double double #define ga_double double
#define ga_half uint16_t #define ga_half uint16_t
#include <Python.h>
#include <numpy/npy_common.h>
""" """
for npy, ga in [("npy_uint8", "ga_ubyte"),
("npy_uint16", "ga_ushort"),
("npy_uin32", "ga_uint"),
("npy_uin64", "ga_ulong"),
("npy_int8", "ga_byte"),
("npy_int16", "ga_short"),
("npy_int32", "ga_int"),
("npy_int64", "ga_long"),
("npy_float32", "ga_float"),
("npy_float64", "ga_double"),
]:
kop = kop.replace(npy, ga)
return ElemwiseKernel(None, inps+outs, kop, preamble=support_code) return ElemwiseKernel(None, inps+outs, kop, preamble=support_code)
def c_headers(self): def c_headers(self):
...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise): ...@@ -165,9 +175,34 @@ class GpuElemwise(HideC, Elemwise):
# implementation # implementation
k = self.generate_kernel(node, nodename) k = self.generate_kernel(node, nodename)
nd = node.inputs[0].type.ndim nd = node.inputs[0].type.ndim
import pycuda._cluda CLUDA_PREAMBLE = """
#define local_barrier() __syncthreads();
#define WITHIN_KERNEL __device__
#define KERNEL extern "C" __global__
#define GLOBAL_MEM /* empty */
#define LOCAL_MEM __shared__
#define LOCAL_MEM_ARG /* empty */
#define REQD_WG_SIZE(X,Y,Z) __launch_bounds__(X*Y*Z, 1)
#define LID_0 threadIdx.x
#define LID_1 threadIdx.y
#define LID_2 threadIdx.z
#define GID_0 blockIdx.x
#define GID_1 blockIdx.y
#define GID_2 blockIdx.z
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z
#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);", res = ["CUdeviceptr (*cuda_get_ptr)(gpudata *g);",
pycuda._cluda.CLUDA_PREAMBLE] CLUDA_PREAMBLE]
for i in range(0, nd + 1): for i in range(0, nd + 1):
res.append(k.render_basic(i, name="elem_" + str(i)) + ';') res.append(k.render_basic(i, name="elem_" + str(i)) + ';')
res.append(k.contig_src + ';') res.append(k.contig_src + ';')
...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise): ...@@ -338,8 +373,8 @@ class GpuElemwise(HideC, Elemwise):
node.inputs + node.outputs)): node.inputs + node.outputs)):
if (n - len(inputs)) in self.inplace_pattern: if (n - len(inputs)) in self.inplace_pattern:
continue continue
dtype = var.dtype dtype = dtype_to_ctype(var.dtype)
param.append("(npy_%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals())
param.append("%(name)s->ga.offset" % locals()) param.append("%(name)s->ga.offset" % locals())
for i in range(nd): for i in range(nd):
param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals())
......
...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi ...@@ -24,7 +24,7 @@ from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBi
GpuCrossentropySoftmax1HotWithBiasDx) GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar, from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
GpuDimShuffle, GpuCAReduce) GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant from theano.sandbox.gpuarray.type import GpuArrayConstant
gpu_optimizer = EquilibriumDB() gpu_optimizer = EquilibriumDB()
...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node): ...@@ -234,6 +234,14 @@ def local_gpua_subtensor(node):
return GpuSubtensor(node.op.idx_list) return GpuSubtensor(node.op.idx_list)
@register_opt()
@op_lifter([tensor.IncSubtensor])
def local_gpua_incsubtensor(node):
return GpuIncSubtensor(node.op.idx_list, node.op.inplace,
node.op.set_instead_of_inc,
node.op.destroyhandler_tolerate_aliased)
@register_opt() @register_opt()
@op_lifter([tensor.CAReduce, tensor.Sum]) @op_lifter([tensor.CAReduce, tensor.Sum])
def local_gpua_careduce(node): def local_gpua_careduce(node):
......
import copy
import StringIO import StringIO
import numpy import numpy
import theano import theano
from theano import tensor, gof from theano import tensor, gof
from theano.tensor.subtensor import Subtensor, get_idx_list
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
from theano.tensor.subtensor import IncSubtensor, Subtensor, get_idx_list
from theano.sandbox.cuda.nvcc_compiler import NVCC_compiler
try: try:
import pygpu import pygpu
...@@ -16,6 +17,7 @@ except ImportError: ...@@ -16,6 +17,7 @@ except ImportError:
from theano.sandbox.gpuarray.type import GpuArrayType from theano.sandbox.gpuarray.type import GpuArrayType
from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC from theano.sandbox.gpuarray.basic_ops import as_gpuarray_variable, HideC
from theano.sandbox.gpuarray.elemwise import GpuElemwise
class GpuSubtensor(HideC, Subtensor): class GpuSubtensor(HideC, Subtensor):
...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor): ...@@ -154,3 +156,203 @@ class GpuSubtensor(HideC, Subtensor):
def c_code_cache_version(self): def c_code_cache_version(self):
return (5,) return (5,)
class GpuIncSubtensor(IncSubtensor):
"""
Implement IncSubtensor on the gpu.
Note: The optimization to make this inplace is in tensor/opt.
The same optimization handles IncSubtensor and GpuIncSubtensor.
This Op has c_code too; it inherits tensor.IncSubtensor's c_code.
The helper methods like do_type_checking, copy_of_x, etc. specialize
the c_code for this Op.
"""
def c_headers(self):
return self.iadd_node.op.c_headers()
def c_compiler(self):
return self.iadd_node.op.c_compiler()
def c_init_code(self):
return self.iadd_node.op.c_init_code()
def make_node(self, x, y, *inputs):
x = as_gpuarray_variable(x)
y = as_gpuarray_variable(y)
rval = tensor.IncSubtensor.make_node(self, x, y, *inputs)
op = copy.copy(self)
ret = gof.Apply(op, [x, y] + rval.inputs[2:], [x.type()])
op.create_iadd_node(ret)
return ret
def create_iadd_node(self, node):
# We store a iadd_node in the op that contain the info needed
# for the inplace add.
cop = theano.tensor.inplace.add_inplace
gop = GpuElemwise(cop.scalar_op, copy.copy(cop.inplace_pattern),
"Gpu" + cop.name, cop.nfunc_spec)
y = node.inputs[1]
xview = y.type()
iadd_node = gop(xview, y).owner
self.iadd_node = iadd_node
def perform(self, node, inputs, out_):
out, = out_
x, y = inputs[:2]
indices = list(reversed(inputs[2:]))
def convert(entry):
if isinstance(entry, gof.Type):
rval = indices.pop()
return rval
elif isinstance(entry, slice):
return slice(convert(entry.start),
convert(entry.stop),
convert(entry.step))
else:
return entry
cdata = tuple(map(convert, self.idx_list))
if len(cdata) == 1:
cdata = cdata[0]
if not self.inplace:
x = x.copy()
sub_x = x.__getitem__(cdata)
if sub_x.shape:
# we've sliced out an N-D tensor with N > 0
if not self.set_instead_of_inc:
#sub_x += y
pygpu.elemwise.ielemwise2(sub_x, '+', y, broadcast=False)
else:
#sub_x += -sub_x + y
x.__setitem__(cdata, y)
else:
# scalar case
if not self.set_instead_of_inc:
#x.__setitem__(cdata, sub_x + y)
tmp = pygpu.elemwise.elemwise2(sub_x, '+', y, sub_x, broadcast=False)
x.__setitem__(cdata, tmp)
else:
x.__setitem__(cdata, y)
out[0] = x
def __setstate__(self, d):
self.__dict__.update(d)
owner = getattr(self.__dict__, "owner", None)
if owner:
op.create_iadd_node(owner)
def __getstate__(self):
d = copy.copy(self.__dict__)
if "iadd_node" in d:
d.pop('iadd_node')
return d
def do_type_checking(self, node):
""" Should raise NotImplementedError if c_code does not support
the types involved in this node.
"""
if not isinstance(node.inputs[0].type, GpuArrayType):
raise NotImplementedError()
def copy_of_x(self, x):
"""
:param x: a string giving the name of a C variable
pointing to an array
:return: C code expression to make a copy of x
Base class uses `PyArrayObject *`, subclasses may override for
different types of arrays.
"""
return """pygpu_copy(%(x)s, GA_ANY_ORDER)""" % locals()
def decl_view(self):
return "PyGpuArrayObject* zview = NULL;"
def make_view_array(self, x, view_ndim):
"""//TODO
:param x: a string identifying an array to be viewed
:param view_ndim: a string specifying the number of dimensions
to have in the view
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
ret = """
size_t dims[%(view_ndim)s];
for(int i=0; i<%(view_ndim)s; i++)
dims[i] = xview_dims[i];
zview = pygpu_fromgpudata(%(x)s->ga.data,
xview_offset,
%(x)s->ga.typecode,
%(view_ndim)s,
dims,
xview_strides,
pygpu_default_context(),
1,
(PyObject *)%(x)s,
(PyObject *)&PyGpuArrayType);
""" % locals()
return ret
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code"""
return {'c_prefix': 'PyGpuArray',
'strides_mul': 1
}
def copy_into(self, view, source):
"""
view: string, C code expression for an array
source: string, C code expression for an array
returns a C code expression to copy source into view, and
return 0 on success
"""
return """GpuArray_move(&%(view)s->ga, &%(source)s->ga)""" % locals()
def c_support_code_apply(self, node, nodename):
gop = self.iadd_node.op
sub_name = nodename + "_add_to_zview"
ret = gop.c_support_code_apply(self.iadd_node, sub_name)
ret += """
PyGpuArrayObject* inc_sub_iadd_%(nodename)s(PyGpuArrayObject* dst,
PyGpuArrayObject* src){
PyGpuArrayObject* ret = NULL;
""" % locals()
#def c_code(self, node, name, inputs, outputs, sub):
inputs = ["dst", "src"]
outputs = ["ret"]
sub = {"fail": "return NULL;"}
ret += gop.c_code(self.iadd_node, sub_name, inputs, outputs, sub)
ret += """
return dst;
}
"""
return ret
def add_to_zview(self, nodename, x, fail):
#TODO
return """
PyGpuArrayObject * add_result = inc_sub_iadd_%(nodename)s(zview, %(x)s);
if (! add_result )
{
Py_DECREF(zview);
%(fail)s;
}
else
{
Py_DECREF(add_result);
}
""" % locals()
def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
elemwise_version = self.iadd_node.c_code_cache_version()
if not parent_version or not elemwise_version:
return
return parent_version + elemwise_version + (0,)
from theano.tensor.tests.test_subtensor import T_subtensor from theano.tensor.tests.test_subtensor import T_subtensor
from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost) from theano.sandbox.gpuarray.basic_ops import (HostFromGpu, GpuFromHost)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import gpuarray_shared_constructor from theano.sandbox.gpuarray.type import gpuarray_shared_constructor
...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp ...@@ -11,6 +11,7 @@ from theano.compile import DeepCopyOp
from theano import tensor from theano import tensor
class G_subtensor(T_subtensor): class G_subtensor(T_subtensor):
def shortDescription(self): def shortDescription(self):
return None return None
...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor): ...@@ -19,8 +20,10 @@ class G_subtensor(T_subtensor):
T_subtensor.__init__(self, name, T_subtensor.__init__(self, name,
shared=gpuarray_shared_constructor, shared=gpuarray_shared_constructor,
sub=GpuSubtensor, sub=GpuSubtensor,
inc_sub=GpuIncSubtensor,
mode=mode_with_gpu, mode=mode_with_gpu,
# avoid errors with limited devices # avoid errors with limited devices
dtype='float32', dtype='float32',
ignore_topo=(HostFromGpu,GpuFromHost,DeepCopyOp)) ignore_topo=(HostFromGpu, GpuFromHost,
DeepCopyOp))
assert self.sub == GpuSubtensor assert self.sub == GpuSubtensor
...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op): ...@@ -1255,7 +1255,7 @@ class IncSubtensor(Op):
copy_into = self.copy_into("zview", y) copy_into = self.copy_into("zview", y)
add_to_zview = self.add_to_zview(y, fail) add_to_zview = self.add_to_zview(name, y, fail)
make_modification = """ make_modification = """
if (%(op_is_set)s) if (%(op_is_set)s)
...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op): ...@@ -1353,7 +1353,7 @@ class IncSubtensor(Op):
""" """
return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals() return """PyArray_CopyInto(%(view)s, %(source)s)""" % locals()
def add_to_zview(self, x, fail): def add_to_zview(self, name, x, fail):
""" Return C code to add x to zview. Should DECREF zview if the """ Return C code to add x to zview. Should DECREF zview if the
add fails.""" add fails."""
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论