提交 c8934e50 authored 作者: lamblin's avatar lamblin

Merge pull request #939 from nouiz/c_code_gpusub

C code gpusub
...@@ -403,7 +403,7 @@ class TestEquilibrium(object): ...@@ -403,7 +403,7 @@ class TestEquilibrium(object):
PatternSub((op4, 'x', 'y'), (op1, 'x', 'y')), PatternSub((op4, 'x', 'y'), (op1, 'x', 'y')),
PatternSub((op3, (op2, 'x', 'y')), (op4, 'x', 'y')) PatternSub((op3, (op2, 'x', 'y')), (op4, 'x', 'y'))
], ],
max_use_ratio = 1. / len(g.nodes)) # each opt can only be applied once max_use_ratio = 1. / len(g.apply_nodes)) # each opt can only be applied once
opt.optimize(g) opt.optimize(g)
finally: finally:
_logger.setLevel(oldlevel) _logger.setLevel(oldlevel)
......
...@@ -544,12 +544,14 @@ def pydotprint(fct, outfile=None, ...@@ -544,12 +544,14 @@ def pydotprint(fct, outfile=None,
if isinstance(fct, Function): if isinstance(fct, Function):
mode = fct.maker.mode mode = fct.maker.mode
fct_fgraph = fct.maker.fgraph profile = getattr(fct, "profile", None)
if (not isinstance(mode, ProfileMode) if (not isinstance(mode, ProfileMode)
or not fct in mode.profile_stats): or not fct in mode.profile_stats):
mode = None mode = None
fct_fgraph = fct.maker.fgraph
elif isinstance(fct, gof.FunctionGraph): elif isinstance(fct, gof.FunctionGraph):
mode = None mode = None
profile = None
fct_fgraph = fct fct_fgraph = fct
else: else:
raise ValueError(('pydotprint expects as input a theano.function or ' raise ValueError(('pydotprint expects as input a theano.function or '
...@@ -660,6 +662,14 @@ def pydotprint(fct, outfile=None, ...@@ -660,6 +662,14 @@ def pydotprint(fct, outfile=None,
else: else:
pf = time * 100 / mode.profile_stats[fct].fct_call_time pf = time * 100 / mode.profile_stats[fct].fct_call_time
prof_str = ' (%.3fs,%.3f%%,%.3f%%)' % (time, pt, pf) prof_str = ' (%.3fs,%.3f%%,%.3f%%)' % (time, pt, pf)
elif profile:
time = profile.apply_time.get(node, 0)
# time in seconds, then its percentage of the function call time in the profile
if profile.fct_callcount == 0:
pf = 0
else:
pf = time * 100 / profile.fct_call_time
prof_str = ' (%.3fs,%.3f%%)' % (time, pf)
applystr = str(node.op).replace(':', '_') applystr = str(node.op).replace(':', '_')
applystr += prof_str applystr += prof_str
if (applystr in all_strings) or with_ids: if (applystr in all_strings) or with_ids:
......
...@@ -1911,7 +1911,7 @@ class GpuReshape(tensor.Reshape, GpuOp): ...@@ -1911,7 +1911,7 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
class GpuSubtensor(tensor.Subtensor, GpuOp): class GpuSubtensor(GpuOp, tensor.Subtensor):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
""" """
...@@ -1920,19 +1920,16 @@ class GpuSubtensor(tensor.Subtensor, GpuOp): ...@@ -1920,19 +1920,16 @@ class GpuSubtensor(tensor.Subtensor, GpuOp):
assert isinstance(x.type, CudaNdarrayType) assert isinstance(x.type, CudaNdarrayType)
rval = tensor.Subtensor.make_node(self, x, *inputs) rval = tensor.Subtensor.make_node(self, x, *inputs)
otype = CudaNdarrayType(rval.outputs[0].type.broadcastable) otype = CudaNdarrayType(rval.outputs[0].type.broadcastable)
#We reverse the index here as a speed optimization return Apply(self, [x] + rval.inputs[1:], [otype()])
#this opt was saving 0.40e-05s of 3.49e05s
return Apply(self, [x] + list(reversed(rval.inputs[1:])), [otype()])
def perform(self, node, inputs, out_): def perform(self, node, inputs, out_):
out, = out_ out, = out_
x = inputs[0] x = inputs[0]
indices = inputs[1:] indices = list(reversed(inputs[1:]))
def convert(entry): def convert(entry):
if isinstance(entry, Type): if isinstance(entry, Type):
rval = indices.pop() rval = indices.pop()
#the if take about .25e-05s
if sys.version_info < (2, 5): if sys.version_info < (2, 5):
# Before Python 2.5, PySlice_GetIndicesEx requires # Before Python 2.5, PySlice_GetIndicesEx requires
# Python int to be passed. # Python int to be passed.
...@@ -1955,6 +1952,59 @@ class GpuSubtensor(tensor.Subtensor, GpuOp): ...@@ -1955,6 +1952,59 @@ class GpuSubtensor(tensor.Subtensor, GpuOp):
cdata = cdata[0] cdata = cdata[0]
out[0] = x.__getitem__(cdata) out[0] = x.__getitem__(cdata)
def c_code(self, node, name, inputs, outputs, sub):
# Build the C source implementing this Op as three concatenated pieces:
#   1. build_view: allocate a new CudaNdarray `xview` pointing at x's
#      device data,
#   2. get_xview: the indexing code produced by helper_c_code, which
#      fills in xview's dims, strides and data pointer,
#   3. finish_view: register x as the base of xview and store xview in
#      the output variable.
# Returns the resulting C code as a single string.
#
# NOTE: x, z, view_ndim and fail look unused in Python, but they are
# substituted into the C templates below through `% locals()`.
x = inputs[0]
z, = outputs
view_ndim = node.outputs[0].ndim
fail = sub['fail']
# The view is created with the output's number of dimensions and is
# first pointed at x's device data with a NULL base. The template also
# defines CudaNdarray_set_device_data2, a wrapper that casts the pointer
# to float* before calling CudaNdarray_set_device_data; it is handed to
# helper_c_code as `set_data` below.
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (!xview)
{
%(fail)s;
}
if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
Py_XDECREF(xview);
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""  % locals()
# Reuse the indexing code generator inherited from Subtensor, switching
# every accessor to its CudaNdarray counterpart. No flag update is
# emitted (update_flags=""). strides_mul=4 scales the stride when the
# helper advances its data pointer.
# NOTE(review): presumably 4 == sizeof(float), because CudaNdarray
# strides count elements while the helper's pointer is a char* --
# confirm against the CudaNdarray definition.
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
set_dim='CudaNdarray_set_dim',
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
# Once the indexing code has run, set the device data again, this time
# passing x as the base, then replace the previous output value with
# the view.
finish_view = """
//Set the base only now
if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(xview);
%(fail)s;
}
Py_XDECREF(%(z)s);
%(z)s = xview;
""" % locals()
# The helper code is wrapped in its own C block.
# NOTE(review): presumably so that the helper's local declarations are
# scoped to that block -- confirm.
return build_view + "{" + get_xview + "}" + finish_view
class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp): class GpuAdvancedSubtensor1(tensor.AdvancedSubtensor1, GpuOp):
""" """
......
...@@ -161,6 +161,14 @@ DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self); ...@@ -161,6 +161,14 @@ DllExport const int *CudaNdarray_DEV_STRIDES(const CudaNdarray * self);
DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self); DllExport const int *CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self);
DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self); DllExport float *CudaNdarray_DEV_DATA(const CudaNdarray * self);
// The following 4 macros make it possible to write C code generators that
// work on both PyArray and CudaNdarray. They are used at least by Subtensor
// and GpuSubtensor.
#define CudaNdarray_DIMS CudaNdarray_HOST_DIMS
#define CudaNdarray_NDIM(self) self->nd
#define CudaNdarray_STRIDES CudaNdarray_HOST_STRIDES
#define CudaNdarray_BYTES CudaNdarray_DEV_DATA
/** /**
* Return the number of elements in the ndarray (product of the dimensions) * Return the number of elements in the ndarray (product of the dimensions)
*/ */
......
...@@ -7,6 +7,8 @@ import subprocess ...@@ -7,6 +7,8 @@ import subprocess
import sys import sys
import warnings import warnings
import numpy
import theano import theano
from theano.gof.cc import hash_from_file from theano.gof.cc import hash_from_file
from theano.gof.cmodule import (std_libs, std_lib_dirs, from theano.gof.cmodule import (std_libs, std_lib_dirs,
...@@ -121,6 +123,17 @@ class NVCC_compiler(object): ...@@ -121,6 +123,17 @@ class NVCC_compiler(object):
os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh')) os.path.join(os.path.split(__file__)[0], 'cuda_ndarray.cuh'))
flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash) flags.append('-DCUDA_NDARRAY_CUH=' + cuda_ndarray_cuh_hash)
# numpy 1.7 deprecated the old flag macros, but the new NPY_ARRAY_* names
# did not exist before that version, so define them for older numpy.
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
if bool(numpy_ver < [1, 7]):
flags.append("-D NPY_ARRAY_ENSURECOPY=NPY_ENSURECOPY")
flags.append("-D NPY_ARRAY_ALIGNED=NPY_ALIGNED")
flags.append("-D NPY_ARRAY_WRITEABLE=NPY_WRITEABLE")
flags.append("-D NPY_ARRAY_UPDATE_ALL=NPY_UPDATE_ALL")
flags.append("-D NPY_ARRAY_C_CONTIGUOUS=NPY_C_CONTIGUOUS")
flags.append("-D NPY_ARRAY_F_CONTIGUOUS=NPY_F_CONTIGUOUS")
# We compile cuda_ndarray.cu during import. # We compile cuda_ndarray.cu during import.
# We should not add device properties at that time. # We should not add device properties at that time.
# As the device is not selected yet! # As the device is not selected yet!
......
...@@ -3952,9 +3952,21 @@ class Subtensor(Op): ...@@ -3952,9 +3952,21 @@ class Subtensor(Op):
return "%s{%s}" % (self.__class__.__name__, ", ".join(indices)) return "%s{%s}" % (self.__class__.__name__, ", ".join(indices))
@staticmethod @staticmethod
def helper_c_code(node, name, inputs, outputs, sub, idx_list): def helper_c_code(node, name, inputs, outputs, sub, idx_list,
if not isinstance(node.inputs[0].type, TensorType): c_prefix="PyArray",
raise NotImplementedError() update_flags=("PyArray_UpdateFlags(xview,"
" NPY_ARRAY_C_CONTIGUOUS|"
"NPY_ARRAY_F_CONTIGUOUS);"),
set_data='PyArray_set_data',
set_dim='PyArray_set_dim',
set_stride='PyArray_set_stride',
strides_mul=1,
):
"""The parameters c_prefix, update_flags, set_data, set_dim,
set_stride and strides_mul are there to allow reusing this
function on both PyArray and CudaNdarray objects.
"""
# #
# two arrays are created in C code: # two arrays are created in C code:
# is_slice: len == ndim, 0 means int, 1 means slice # is_slice: len == ndim, 0 means int, 1 means slice
...@@ -4019,7 +4031,6 @@ class Subtensor(Op): ...@@ -4019,7 +4031,6 @@ class Subtensor(Op):
assert len(is_slice) <= node.inputs[0].ndim, node.inputs[0].ndim assert len(is_slice) <= node.inputs[0].ndim, node.inputs[0].ndim
len_is_slice = len(is_slice) len_is_slice = len(is_slice)
view_ndim = node.inputs[0].ndim - (numpy.asarray(is_slice) == 0).sum()
len_subtensor_spec = spec_pos() len_subtensor_spec = spec_pos()
...@@ -4030,6 +4041,10 @@ class Subtensor(Op): ...@@ -4030,6 +4041,10 @@ class Subtensor(Op):
z, = outputs z, = outputs
rval = """ rval = """
#define PyArray_set_dim(obj, idx, d) PyArray_DIMS(obj)[idx]=d
#define PyArray_set_stride(obj, idx, d) PyArray_STRIDES(obj)[idx]=d
#define PyArray_set_data(obj, ptr, base) PyArray_BYTES(obj)=ptr
// The subtensor is created by iterating over the dimensions // The subtensor is created by iterating over the dimensions
// and updating stride, shape, and data pointers // and updating stride, shape, and data pointers
...@@ -4040,39 +4055,30 @@ class Subtensor(Op): ...@@ -4040,39 +4055,30 @@ class Subtensor(Op):
int inner_ii = 0; // the current dimension of zview int inner_ii = 0; // the current dimension of zview
int outer_ii = 0; // current dimension of z int outer_ii = 0; // current dimension of z
//TODO: give this Op a second output so that this view can be cached char* ptr = (char*) %(c_prefix)s_BYTES(xview);
//TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(x)s));
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(x)s),
%(view_ndim)s,
PyArray_DIMS(%(x)s),
PyArray_STRIDES(%(x)s),
PyArray_DATA(%(x)s),
%(x)s->flags,
NULL);
if (!xview)
{
%(fail)s;
}
if ((PyArray_DIMS(xview) == PyArray_DIMS(%(x)s)) if ((%(c_prefix)s_DIMS(xview) == %(c_prefix)s_DIMS(%(x)s))
&& (PyArray_DIMS(%(x)s) != NULL)) && (%(c_prefix)s_DIMS(%(x)s) != NULL))
{ {
PyErr_Format(PyExc_ValueError, "x and xview" PyErr_Format(PyExc_ValueError, "x and xview"
"(with %%d dims) have the same dimensions" "(with %%d dims) have the same dimensions"
" pointers: %%p and %%p", " pointers: %%p and %%p",
PyArray_NDIM(%(x)s), PyArray_DIMS(xview), PyArray_DIMS(%(x)s)); %(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_DIMS(xview),
%(c_prefix)s_DIMS(%(x)s));
Py_XDECREF(xview);
%(fail)s; %(fail)s;
} }
if (PyArray_STRIDES(xview) == PyArray_STRIDES(%(x)s) if (%(c_prefix)s_STRIDES(xview) == %(c_prefix)s_STRIDES(%(x)s)
&& (PyArray_DIMS(%(x)s) != NULL)) && (%(c_prefix)s_DIMS(%(x)s) != NULL))
{ {
PyErr_Format(PyExc_ValueError, "x and xview" PyErr_Format(PyExc_ValueError, "x and xview"
"(with %%d dims) have the same strides" "(with %%d dims) have the same strides"
" pointers: %%p and %%p", " pointers: %%p and %%p",
PyArray_NDIM(%(x)s), PyArray_STRIDES(xview), PyArray_STRIDES(%(x)s)); %(c_prefix)s_NDIM(%(x)s),
%(c_prefix)s_STRIDES(xview),
%(c_prefix)s_STRIDES(%(x)s));
Py_XDECREF(xview);
%(fail)s; %(fail)s;
} }
...@@ -4080,7 +4086,7 @@ class Subtensor(Op): ...@@ -4080,7 +4086,7 @@ class Subtensor(Op):
{ {
if (is_slice[outer_ii]) if (is_slice[outer_ii])
{ {
npy_intp length = PyArray_DIMS(%(x)s)[outer_ii]; npy_intp length = %(c_prefix)s_DIMS(%(x)s)[outer_ii];
npy_intp slicelength; npy_intp slicelength;
npy_intp start = subtensor_spec[spec_pos+0]; npy_intp start = subtensor_spec[spec_pos+0];
npy_intp stop = subtensor_spec[spec_pos+1]; npy_intp stop = subtensor_spec[spec_pos+1];
...@@ -4097,6 +4103,7 @@ class Subtensor(Op): ...@@ -4097,6 +4103,7 @@ class Subtensor(Op):
Py_DECREF(xview); Py_DECREF(xview);
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"slice step cannot be zero"); "slice step cannot be zero");
Py_XDECREF(xview);
%(fail)s; %(fail)s;
} }
...@@ -4144,9 +4151,12 @@ class Subtensor(Op): ...@@ -4144,9 +4151,12 @@ class Subtensor(Op):
} }
assert (slicelength <= length); assert (slicelength <= length);
xview->data += PyArray_STRIDES(%(x)s)[outer_ii] * start;
PyArray_DIMS(xview)[inner_ii] = slicelength; ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * start *
PyArray_STRIDES(xview)[inner_ii] = PyArray_STRIDES(%(x)s)[outer_ii] * step; %(strides_mul)s;
%(set_dim)s(xview, inner_ii, slicelength);
%(set_stride)s(xview, inner_ii,
%(c_prefix)s_STRIDES(%(x)s)[outer_ii] * step);
inner_ii += 1; inner_ii += 1;
spec_pos += 3; spec_pos += 3;
...@@ -4154,53 +4164,81 @@ class Subtensor(Op): ...@@ -4154,53 +4164,81 @@ class Subtensor(Op):
else // tuple coord `outer_ii` is an int else // tuple coord `outer_ii` is an int
{ {
int idx = subtensor_spec[spec_pos]; int idx = subtensor_spec[spec_pos];
if (idx < 0) idx += PyArray_DIMS(%(x)s)[outer_ii]; if (idx < 0) idx += %(c_prefix)s_DIMS(%(x)s)[outer_ii];
if (idx >= 0) if (idx >= 0)
{ {
if (idx < PyArray_DIMS(%(x)s)[outer_ii]) if (idx < %(c_prefix)s_DIMS(%(x)s)[outer_ii])
{ {
xview->data += PyArray_STRIDES(%(x)s)[outer_ii] * idx; ptr += %(c_prefix)s_STRIDES(%(x)s)[outer_ii] * idx *
%(strides_mul)s;
} }
else else
{ {
PyErr_Format(PyExc_IndexError,"index out of bounds"); PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
%(fail)s; %(fail)s;
} }
} }
else else
{ {
PyErr_Format(PyExc_IndexError,"index out of bounds"); PyErr_Format(PyExc_IndexError,"index out of bounds");
Py_XDECREF(xview);
%(fail)s; %(fail)s;
} }
spec_pos += 1; spec_pos += 1;
} }
} }
assert (inner_ii <= PyArray_NDIM(xview)); %(set_data)s(xview, ptr, (PyObject*)NULL);
while (inner_ii < PyArray_NDIM(xview)) assert (inner_ii <= %(c_prefix)s_NDIM(xview));
while (inner_ii < %(c_prefix)s_NDIM(xview))
{ {
assert (outer_ii < PyArray_NDIM(%(x)s)); assert (outer_ii < %(c_prefix)s_NDIM(%(x)s));
PyArray_DIMS(xview)[inner_ii] = PyArray_DIMS(%(x)s)[outer_ii]; %(set_dim)s(xview, inner_ii, %(c_prefix)s_DIMS(%(x)s)[outer_ii]);
PyArray_STRIDES(xview)[inner_ii] = PyArray_STRIDES(%(x)s)[outer_ii]; %(set_stride)s(xview, inner_ii, %(c_prefix)s_STRIDES(%(x)s)[outer_ii]);
inner_ii += 1; inner_ii += 1;
outer_ii += 1; outer_ii += 1;
} }
PyArray_UpdateFlags(xview, NPY_ARRAY_C_CONTIGUOUS|NPY_F_CONTIGUOUS); %(update_flags)s
""" % locals() """ % locals()
# print rval # print rval
return rval return rval
@staticmethod @staticmethod
def helper_c_code_cache_version(): def helper_c_code_cache_version():
return (4,) return (5,)
def c_code(self, node, name, inputs, outputs, sub): # DEBUG def c_code(self, node, name, inputs, outputs, sub): # DEBUG
part0 = self.helper_c_code(node, name, inputs, outputs, sub, if not isinstance(node.inputs[0].type, TensorType):
self.idx_list) raise NotImplementedError()
x = inputs[0] x = inputs[0]
z, = outputs z, = outputs
part1 = """ view_ndim = node.outputs[0].ndim
fail = sub['fail']
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(x)s));
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(x)s),
%(view_ndim)s,
PyArray_DIMS(%(x)s),
PyArray_STRIDES(%(x)s),
PyArray_DATA(%(x)s),
%(x)s->flags,
NULL);
if (!xview)
{
%(fail)s;
}
""" % locals()
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list)
finish_view = """
if (%(z)s) Py_DECREF(%(z)s); if (%(z)s) Py_DECREF(%(z)s);
Py_INCREF(py_%(x)s); Py_INCREF(py_%(x)s);
PyArray_BASE(xview) = py_%(x)s; PyArray_BASE(xview) = py_%(x)s;
...@@ -4208,7 +4246,7 @@ class Subtensor(Op): ...@@ -4208,7 +4246,7 @@ class Subtensor(Op):
%(z)s = xview; %(z)s = xview;
""" % locals() """ % locals()
return part0 + part1 return build_view + "{" + get_xview + "}" + finish_view
def c_code_cache_version(self): def c_code_cache_version(self):
hv = self.helper_c_code_cache_version() hv = self.helper_c_code_cache_version()
...@@ -4216,7 +4254,7 @@ class Subtensor(Op): ...@@ -4216,7 +4254,7 @@ class Subtensor(Op):
# have a versioned version of this op's C code. # have a versioned version of this op's C code.
if len(hv) == 0: if len(hv) == 0:
return () return ()
return (1, hv) return (2, hv)
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
# Subtensor is not differentiable wrt to its indices, therefore we # Subtensor is not differentiable wrt to its indices, therefore we
...@@ -4476,6 +4514,8 @@ class IncSubtensor(Op): ...@@ -4476,6 +4514,8 @@ class IncSubtensor(Op):
out[0] = x out[0] = x
def c_code(self, node, name, inputs, outputs, sub): # DEBUG def c_code(self, node, name, inputs, outputs, sub): # DEBUG
if not isinstance(node.inputs[0].type, TensorType):
raise NotImplementedError()
if self.inplace: # convert bool to int if self.inplace: # convert bool to int
inplace = 1 inplace = 1
...@@ -4489,7 +4529,9 @@ class IncSubtensor(Op): ...@@ -4489,7 +4529,9 @@ class IncSubtensor(Op):
else: else:
op_is_set = 0 op_is_set = 0
fail = sub['fail'] fail = sub['fail']
view_ndim = (node.inputs[0].ndim -
numpy.sum([not isinstance(idx, slice)
for idx in self.idx_list]))
copy_input_if_necessary = """ copy_input_if_necessary = """
if (%(inplace)s) if (%(inplace)s)
{ {
...@@ -4508,6 +4550,25 @@ class IncSubtensor(Op): ...@@ -4508,6 +4550,25 @@ class IncSubtensor(Op):
} }
""" % locals() """ % locals()
#Make a first view on the output, as we will write into it.
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
Py_INCREF(PyArray_DESCR(%(z)s));
PyArrayObject * xview = (PyArrayObject*)PyArray_NewFromDescr(
&PyArray_Type,
PyArray_DESCR(%(z)s),
%(view_ndim)s,
PyArray_DIMS(%(z)s),
PyArray_STRIDES(%(z)s),
PyArray_DATA(%(z)s),
%(z)s->flags,
NULL);
if (!xview)
{
%(fail)s;
}
""" % locals()
# make xview actually a view of %(z)s # make xview actually a view of %(z)s
get_xview = Subtensor.helper_c_code(node, name, get_xview = Subtensor.helper_c_code(node, name,
outputs[:1] + inputs[2:], outputs[:1] + inputs[2:],
...@@ -4541,7 +4602,8 @@ class IncSubtensor(Op): ...@@ -4541,7 +4602,8 @@ class IncSubtensor(Op):
""" % locals() """ % locals()
return (copy_input_if_necessary return (copy_input_if_necessary
+ get_xview + build_view
+ "{" + get_xview + "}"
+ make_modification + make_modification
+ "Py_DECREF(xview);" + "Py_DECREF(xview);"
) )
...@@ -5385,7 +5447,7 @@ class Reshape(Op): ...@@ -5385,7 +5447,7 @@ class Reshape(Op):
// -- will err if this will downcast. This could happen if the // -- will err if this will downcast. This could happen if the
// -- user pass an int64 dtype, but npy_intp endup being int32. // -- user pass an int64 dtype, but npy_intp endup being int32.
new_dims[ii] = ((dtype_%(shp)s*)( new_dims[ii] = ((dtype_%(shp)s*)(
PyArray_DATA(%(shp)s) + ii * PyArray_STRIDES(%(shp)s)[0]))[0]; PyArray_BYTES(%(shp)s) + ii * PyArray_STRIDES(%(shp)s)[0]))[0];
} }
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape, %(z)s = (PyArrayObject *) PyArray_Newshape(%(x)s, &newshape,
......
...@@ -159,9 +159,9 @@ class SoftmaxWithBias(gof.Op): ...@@ -159,9 +159,9 @@ class SoftmaxWithBias(gof.Op):
double sum = 0.0; double sum = 0.0;
bool discount_max = false; bool discount_max = false;
const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_DATA(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i); const dtype_%(x)s* __restrict__ x_i = (dtype_%(x)s*)(PyArray_BYTES(%(x)s) + PyArray_STRIDES(%(x)s)[0] * i);
const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_DATA(%(b)s)); const dtype_%(b)s* __restrict__ b_i = (dtype_%(b)s*)(PyArray_BYTES(%(b)s));
dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); dtype_%(sm) s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
""" """
inside_row_loop = """ inside_row_loop = """
...@@ -306,11 +306,11 @@ class SoftmaxGrad(gof.Op): ...@@ -306,11 +306,11 @@ class SoftmaxGrad(gof.Op):
for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i) for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
{ {
const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_DATA(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i); const dtype_%(dy)s* __restrict__ dy_i = (dtype_%(dy)s*) (PyArray_BYTES(%(dy)s) + PyArray_STRIDES(%(dy)s)[0] * i);
npy_intp Sdy = PyArray_STRIDES(%(dy)s)[1]/sizeof(dtype_%(dy)s); npy_intp Sdy = PyArray_STRIDES(%(dy)s)[1]/sizeof(dtype_%(dy)s);
const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*) (PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_DATA(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i); dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*) (PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s); npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);
double sum_dy_times_sm = 0.; double sum_dy_times_sm = 0.;
...@@ -825,9 +825,9 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -825,9 +825,9 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
""", """,
begin_row_loop, begin_row_loop,
""" """
const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_DATA(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0]; const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_DATA(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i); dtype_%(nll) s* __restrict__ nll_i = (dtype_%(nll)s*)(PyArray_BYTES(%(nll)s) + PyArray_STRIDES(%(nll)s)[0] * i);
%(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_DATA(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i); %(am_type)s* __restrict__ am_i = (%(am_type)s*) (PyArray_BYTES(%(am)s) + PyArray_STRIDES(%(am)s)[0] * i);
""", """,
inside_row_loop, inside_row_loop,
""" """
...@@ -977,14 +977,14 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op): ...@@ -977,14 +977,14 @@ class CrossentropySoftmax1HotWithBiasDx (gof.Op):
for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i) for (size_t i = 0; i < PyArray_DIMS(%(dx)s)[0]; ++i)
{ {
const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_DATA(%(dnll)s) + PyArray_STRIDES(%(dnll)s)[0] * i))[0]; const dtype_%(dnll)s dnll_i = ((dtype_%(dnll)s*)(PyArray_BYTES(%(dnll)s) + PyArray_STRIDES(%(dnll)s)[0] * i))[0];
const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_DATA(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0]; const %(y_idx_type) s y_i = ((%(y_idx_type)s*)(PyArray_BYTES(%(y_idx)s) + PyArray_STRIDES(%(y_idx)s)[0] * i))[0];
const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_DATA(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i); const dtype_%(sm)s* __restrict__ sm_i = (dtype_%(sm)s*)(PyArray_BYTES(%(sm)s) + PyArray_STRIDES(%(sm)s)[0] * i);
npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s); npy_intp Ssm = PyArray_STRIDES(%(sm)s)[1]/sizeof(dtype_%(sm)s);
dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_DATA(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i); dtype_%(dx) s* __restrict__ dx_i = (dtype_%(dx)s*)(PyArray_BYTES(%(dx)s) + PyArray_STRIDES(%(dx)s)[0] * i);
npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s); npy_intp Sdx = PyArray_STRIDES(%(dx)s)[1]/sizeof(dtype_%(dx)s);
for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j) for (size_t j = 0; j < PyArray_DIMS(%(dx)s)[1]; ++j)
......
...@@ -2565,15 +2565,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2565,15 +2565,15 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.fail() self.fail()
def test1_ok_range_finite(self): def test1_ok_range_finite(self):
n = self.shared(numpy.ones(3, dtype=self.dtype) * 5) n = self.shared(numpy.arange(3, dtype=self.dtype))
t = n[0:2] t = n[0:2]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(tval[1] == 5.0) self.assertTrue((tval == [0, 1]).all())
def test2_ok_range_finite(self): def test2_ok_range_finite(self):
n = self.shared(numpy.ones((3, 4), dtype=self.dtype) * 5) n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((3, 4)))
# Also check negative index # Also check negative index
for idx in [(slice(0, 2), 3), ((slice(0, 2), -1)), (slice(0, 2), -4)]: for idx in [(slice(0, 2), 3), ((slice(0, 2), -1)), (slice(0, 2), -4)]:
t = n[idx] # l]#0:2,3] t = n[idx] # l]#0:2,3]
...@@ -2612,25 +2612,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2612,25 +2612,25 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
def test1_ok_range_infinite(self): def test1_ok_range_infinite(self):
#Subtensor.debug = True #Subtensor.debug = True
n = self.shared(numpy.ones(3, dtype=self.dtype) * 5) n = self.shared(numpy.arange(3, dtype=self.dtype))
t = n[1:] t = n[1:]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(tval[1] == 5.0) self.assertTrue((tval == [1.0, 2.0]).all())
def test1_ok_strided(self): def test1_ok_strided(self):
n = self.shared(numpy.ones(5, dtype=self.dtype) * 5) n = self.shared(numpy.arange(5, dtype=self.dtype))
t = n[1::2] t = n[1::2]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(tval[1] == 5.0) self.assertTrue((tval == [1.0, 3.0]).all())
t = n[0:-1:2] # 0 to 1 from the end stepping by 2 t = n[0:-1:2] # 0 to 1 from the end stepping by 2
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(tval[1] == 5.0) self.assertTrue((tval == [0.0, 2.0]).all())
def test2_err_bounds0(self): def test2_err_bounds0(self):
n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5) n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5)
...@@ -2671,8 +2671,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2671,8 +2671,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
sys.stderr = old_stderr sys.stderr = old_stderr
def test2_ok_elem(self): def test2_ok_elem(self):
n = self.shared(numpy.asarray(range(6), dtype=self.dtype). n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
reshape((2, 3)))
t = n[0, 2] t = n[0, 2]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
...@@ -2680,8 +2679,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2680,8 +2679,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.all(tval == 2)) self.assertTrue(numpy.all(tval == 2))
def test2_ok_row(self): def test2_ok_row(self):
n = self.shared(numpy.asarray(range(6), dtype=self.dtype). n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
reshape((2, 3)))
t = n[1] t = n[1]
self.assertFalse(any(n.type.broadcastable)) self.assertFalse(any(n.type.broadcastable))
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
...@@ -2690,25 +2688,24 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2690,25 +2688,24 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.all(tval == [3, 4, 5])) self.assertTrue(numpy.all(tval == [3, 4, 5]))
def test2_ok_col(self): def test2_ok_col(self):
n = self.shared(numpy.ones((2, 3), dtype=self.dtype) * 5) n = self.shared(numpy.arange(6, dtype=self.dtype).reshape((2, 3)))
t = n[:, 0] t = n[:, 0]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
self.assertFalse(any(n.type.broadcastable)) self.assertFalse(any(n.type.broadcastable))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(numpy.all(tval == 5.0)) self.assertTrue(numpy.all(tval == [0, 3]))
def test2_ok_rows_finite(self): def test2_ok_rows_finite(self):
n = self.shared(numpy.ones((4, 3), dtype=self.dtype) * 5) n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((4, 3)))
t = n[1:3, 0] t = n[1:3, 0]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
self.assertTrue(tval.shape == (2,)) self.assertTrue(tval.shape == (2,))
self.assertTrue(numpy.all(tval == 5.0)) self.assertTrue(numpy.all(tval == [3, 6]))
def test2_ok_cols_infinite(self): def test2_ok_cols_infinite(self):
n = self.shared(numpy.asarray(range(12), dtype=self.dtype). n = self.shared(numpy.arange(12, dtype=self.dtype).reshape((4, 3)))
reshape((4, 3)))
t = n[1, 2:] t = n[1, 2:]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
...@@ -2716,8 +2713,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2716,8 +2713,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.all(tval == 5)) self.assertTrue(numpy.all(tval == 5))
def test2_ok_strided(self): def test2_ok_strided(self):
n = self.shared(numpy.asarray(range(20), dtype=self.dtype). n = self.shared(numpy.arange(20, dtype=self.dtype).reshape((4, 5)))
reshape((4, 5)))
t = n[1:4:2, 1:5:2] t = n[1:4:2, 1:5:2]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
...@@ -2725,8 +2721,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2725,8 +2721,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
self.assertTrue(numpy.all(tval == [[6, 8], [16, 18]])) self.assertTrue(numpy.all(tval == [[6, 8], [16, 18]]))
def test3_ok_mat(self): def test3_ok_mat(self):
n = self.shared(numpy.asarray(range(24), dtype=self.dtype). n = self.shared(numpy.arange(24, dtype=self.dtype).reshape((2, 3, 4)))
reshape((2, 3, 4)))
t = n[0, 0, 0] t = n[0, 0, 0]
self.assertTrue(isinstance(t.owner.op, Subtensor)) self.assertTrue(isinstance(t.owner.op, Subtensor))
tval = self.eval_output_and_check(t) tval = self.eval_output_and_check(t)
...@@ -2745,8 +2740,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin): ...@@ -2745,8 +2740,7 @@ class T_subtensor(unittest.TestCase, utt.TestOptimizationMixin):
""" """
newaxis = numpy.newaxis newaxis = numpy.newaxis
n = self.shared(numpy.asarray(range(24), dtype=self.dtype). n = self.shared(numpy.arange(24, dtype=self.dtype).reshape((2, 3, 4)))
reshape((2, 3, 4)))
assert n.ndim == 3 assert n.ndim == 3
n4 = n[newaxis, :, :, :] n4 = n[newaxis, :, :, :]
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论