提交 9950ce08 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #1580 from nouiz/deprecated

[MRG]Deprecated
global-include *.txt
global-include *.c
global-include *.cu
global-include *.cuh
global-include *.sh
......
......@@ -67,9 +67,9 @@ you should check the strides and alignment.
if (!%(y)s)
%(fail)s;
{//New scope needed to make compilation work
dtype_%(y)s * y = (dtype_%(y)s*)%(y)s->data;
dtype_%(x)s * x = (dtype_%(x)s*)%(x)s->data;
for (int i = 2; i < %(x)s->dimensions[0]; ++i)
dtype_%(y)s * y = (dtype_%(y)s*)PyArray_DATA(%(y)s);
dtype_%(x)s * x = (dtype_%(x)s*)PyArray_DATA(%(x)s);
for (int i = 2; i < PyArray_DIMS(%(x)s)[0]; ++i)
y[i] = y[i-1]*y[i-2] + x[i];
}
""" % locals()
......
......@@ -420,7 +420,9 @@ TensorVariable
.. class:: _tensor_py_operators(object)
This mix-in class adds convenient attributes, methods, and support for Python operators (see :ref:`tensor_operator_support`).
This mix-in class adds convenient attributes, methods, and support
to TensorVariable, TensorConstant and TensorSharedVariable for
Python operators (see :ref:`tensor_operator_support`).
.. attribute:: type
......@@ -472,6 +474,10 @@ TensorVariable
See :func:`flatten`.
.. method:: ravel()
return self.flatten(). For NumPy compatibility.
.. attribute:: T
Transpose of this tensor.
......@@ -485,8 +491,31 @@ TensorVariable
same vector! Use `reshape` or `dimshuffle` to turn your vector
into a row or column matrix.
.. method:: {any,all}(axis=None, keepdims=False)
.. method:: {sum,prod,mean}(axis=None, dtype=None, keepdims=False, acc_dtype=None)
.. method:: {var,std,min,max,argmin,argmax}(axis=None, keepdims=False),
.. method:: diagonal(offset=0, axis1=0, axis2=1)
.. method:: astype(dtype)
.. method:: take(indices, axis=None, mode='raise')
.. method:: copy()
.. method:: norm(L, axis=None)
.. method:: nonzero(self, return_matrix=False)
.. method:: nonzero_values(self)
.. method:: sort(self, axis=-1, kind='quicksort', order=None)
.. method:: argsort(self, axis=-1, kind='quicksort', order=None)
.. method:: clip(self, a_min, a_max)
.. method:: conf()
.. method:: repeat(repeats, axis=None)
.. method:: round(mode="half_away_from_zero")
.. method:: trace()
.. method:: get_scalar_constant_value()
.. method:: zeros_like(model, dtype=None)
All the above methods are equivalent to NumPy for Theano on the current tensor.
.. method:: __{abs,neg,lt,le,gt,ge,invert,and,or,add,sub,mul,div,truediv,floordiv}__
Those elemwise operation are supported via Python syntax.
Shaping and Shuffling
=====================
......
......@@ -155,11 +155,11 @@ class WeirdBrokenOp(gof.Op):
prep_vars = """
//the output array has size M x N
npy_intp M = PyArray_DIMS(%(a)s)[0];
npy_intp Sa = %(a)s->strides[0] / PyArray_DESCR(%(a)s)->elsize;
npy_intp Sz = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize;
npy_intp Sa = PyArray_STRIDES(%(a)s)[0] / PyArray_DESCR(%(a)s)->elsize;
npy_intp Sz = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
npy_double * Da = (npy_double*)%(a)s->data;
npy_double * Dz = (npy_double*)%(z)s->data;
npy_double * Da = (npy_double*)PyArray_BYTES(%(a)s);
npy_double * Dz = (npy_double*)PyArray_BYTES(%(z)s);
//clear the output array
for (npy_intp m = 0; m < M; ++m)
......
......@@ -1693,7 +1693,7 @@ class GCC_compiler(object):
#to use the new API, but not everywhere. When finished, enable
#the following macro to assert that we don't bring new code
#that use the old API.
#cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
# numpy 1.7 deprecated the following macro but the new one didn't
......
......@@ -76,10 +76,7 @@ except ImportError:
except ImportError:
_logger.info("Compiling new CVM")
dirname = 'lazylinker_ext'
# We use a .txt extensions as otherwise it don't get
# included when we create a package to send to pypi
# This happen even if we tell to include *.c files
cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c.txt')
cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
code = open(cfile).read()
loc = os.path.join(config.compiledir, dirname)
if not os.path.exists(loc):
......
......@@ -220,6 +220,7 @@ if __name__ == "__main__":
GTX 650 Ti 0.27s
GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GTX 550 Ti 0.57s
GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
......
......@@ -2223,12 +2223,6 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor):
"""
Implement subtensor on the gpu.
......@@ -2276,16 +2270,27 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
view_ndim = node.outputs[0].ndim
fail = sub['fail']
decl = "CudaNdarray* xview = NULL;"
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
view_ndim=view_ndim,
c_prefix='CudaNdarray',
strides_mul=4,
)
build_view = """
//TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure
CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (!xview)
{
%(fail)s;
}
if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
if (CudaNdarray_set_device_data(
xview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
......@@ -2294,43 +2299,24 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s;
}
cnda_mark_dev_structure_dirty(xview);
""" % locals()
get_xview = _define_set_data + \
self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
c_prefix='CudaNdarray',
set_data='CudaNdarray_set_device_data2',
set_dim='CudaNdarray_set_dim',
set_stride='CudaNdarray_set_stride',
update_flags="", strides_mul=4)
finish_view = ""
#For broadcasted dimensions, set the strides to 0
#We can't do that only for broadcasted dimensions as this can happen for dimensions of size 0,
#That are rebroadcated later.
for idx in range(node.outputs[0].ndim):
finish_view += """
if(CudaNdarray_HOST_DIMS(xview)[%(idx)s]==1)
CudaNdarray_set_stride(xview, %(idx)s, 0);
""" % locals()
finish_view += """
//Set the base only now
if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(xview);
%(fail)s;
for(int idx=0;idx <%(view_ndim)s; idx++){
//For broadcasted dimensions, set the strides to 0
//We can't do that only for broadcasted dimensions as this can happen
//for dimensions of size 0. That are rebroadcated later.
if(xview_dims[idx]==1)
CudaNdarray_set_stride(xview, idx, 0);
else
CudaNdarray_set_stride(xview, idx, xview_strides[idx]);
CudaNdarray_set_dim(xview, idx, xview_dims[idx]);
}
""" % locals()
finish_view = """
Py_XDECREF(%(z)s);
%(z)s = xview;
""" % locals()
return build_view + "{" + get_xview + "}" + finish_view
return decl + get_xview + build_view + finish_view
def c_code_cache_version(self):
hv = self.helper_c_code_cache_version()
......@@ -2719,6 +2705,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
""" %locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
Implement IncSubtensor on the gpu.
......@@ -2756,6 +2743,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()
def decl_view(self):
return "CudaNdarray* zview = NULL;"
def make_view_array(self, x, view_ndim):
"""
:param x: a string identifying an array to be viewed
......@@ -2765,17 +2755,32 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
This doesn't need to actually set up the view with the
right indexing; we'll do that manually later.
"""
return """CudaNdarray* zview = (CudaNdarray*)
CudaNdarray_New(%(view_ndim)s)""" % locals()
ret = """zview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (CudaNdarray_set_device_data(
zview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{
zview = NULL;
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
}else{
cnda_mark_dev_structure_dirty(zview);
for(int idx=0;idx <%(view_ndim)s; idx++){
if(xview_dims[idx]==1)
CudaNdarray_set_stride(zview, idx, 0);
else
CudaNdarray_set_stride(zview, idx, xview_strides[idx]);
CudaNdarray_set_dim(zview, idx, xview_dims[idx]);
}
}
""" % locals()
return ret
def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code"""
return { 'update_flags' : "",
'c_prefix' : 'CudaNdarray',
'set_data' :'CudaNdarray_set_device_data2',
'set_dim' : 'CudaNdarray_set_dim',
'set_stride' : 'CudaNdarray_set_stride',
'update_flags' : "",
return {'c_prefix': 'CudaNdarray',
'strides_mul': 4
}
......@@ -2789,24 +2794,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
"""
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
return _define_set_data
def link_view_array(self, x, fail):
return """
if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
Py_XDECREF(zview);
%(fail)s;
}
cnda_mark_dev_structure_dirty(zview);
""" % locals()
def set_view_base(self, x, fail):
return """
//Set the base only now
......@@ -2823,7 +2810,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
def add_to_zview(self, x, fail):
return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
(PyObject *) py_%(x)s);
......@@ -2839,7 +2825,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" % locals()
def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if parent_version:
return parent_version + (0,)
......
......@@ -5,13 +5,14 @@ Generator code in SSJ package (L'Ecuyer & Simard)
http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
"""
import sys, warnings
import warnings
import numpy
from theano import Op, Apply, shared, config, Variable
from theano.tensor import (raw_random, TensorType, as_tensor_variable,
get_vector_length, cast, opt, scal)
from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod
from theano.tensor import sqrt, log, sin, cos, join, prod
from theano.compile import optdb
from theano.gof import local_optimizer
from theano.gof.python25 import all, any
......@@ -36,6 +37,7 @@ def matVecModM(A, s, m):
x[i] = r + m
return x
def multMatVect(v, A, m1, B, m2):
#multiply the first half of v by A with a modulo of m1
#and the second half by B with a modulo of m2
......@@ -79,9 +81,11 @@ A2p134 = numpy.asarray(
[1401213391, 1178684362, 1431130166]])
np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
def ff_2p134(rstate):
return multMatVect(rstate, A1p134, M1, A2p134, M2)
def ff_2p72(rstate):
return multMatVect(rstate, A1p72, M1, A2p72, M2)
......@@ -93,8 +97,8 @@ def mrg_next_value(rstate, new_rstate):
#i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
#first component
y1 = (((x12 & MASK12) << i22) + (x12 >> i9)
+ ((x13 & MASK13) << i7) + (x13 >> i24))
y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M1): #must also check overflow
......@@ -135,6 +139,7 @@ def mrg_next_value(rstate, new_rstate):
else:
return (x11 - x21) * NORM
class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False):
Op.__init__(self)
......@@ -145,17 +150,19 @@ class mrg_uniform_base(Op):
self.warned_numpy_version = False
def __eq__(self, other):
return type(self) == type(other) \
and self.output_type == other.output_type \
and self.inplace == other.inplace
return (type(self) == type(other) and
self.output_type == other.output_type and
self.inplace == other.inplace)
def __hash__(self):
return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
def __str__(self):
if self.inplace:
s = "inplace"
else: s = "no_inplace"
return self.__class__.__name__+"{%s,%s}"%(self.output_type,s)
else:
s = "no_inplace"
return self.__class__.__name__ + "{%s,%s}" % (self.output_type, s)
def make_node(self, rstate, size):
# error checking slightly redundant here, since
......@@ -166,7 +173,7 @@ class mrg_uniform_base(Op):
[rstate, size],
[rstate.type(), self.output_type()])
def grad(self,inputs,ograd):
def grad(self, inputs, ograd):
return [None for i in inputs]
def R_op(self, inputs, eval_points):
......@@ -187,8 +194,8 @@ class mrg_uniform(mrg_uniform_base):
def perform(self, node, inp, out):
rstate, size = inp
o_rstate, o_sample = out
numpy_version=numpy.__version__.split('.')
if not self.warned_numpy_version and int(numpy_version[0])<=1 and int(numpy_version[1])<3:
numpy_version = numpy.__version__.split('.')
if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 :
print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
self.warned_numpy_version = True
......@@ -201,20 +208,21 @@ class mrg_uniform(mrg_uniform_base):
for s in size:
n_elements *= s
n_streams,_ = rstate.shape
n_streams, _ = rstate.shape
rval = numpy.zeros(n_elements, dtype=self.output_type.dtype)
err_orig = numpy.seterr(over='ignore')
try:
for i in xrange(n_elements):
sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams])
sample = mrg_next_value(rstate[i % n_streams],
rstate[i % n_streams])
rval[i] = sample
finally:
numpy.seterr(**err_orig)
o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size)) # send to GPU if necessary
def c_code(self, node, name, inp, out, sub):
rstate, size = inp
......@@ -228,7 +236,7 @@ class mrg_uniform(mrg_uniform_base):
fail = sub['fail']
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
......@@ -279,7 +287,7 @@ class mrg_uniform(mrg_uniform_base):
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
......@@ -313,8 +321,8 @@ class mrg_uniform(mrg_uniform_base):
}
n_streams = PyArray_DIMS(%(o_rstate)s)[0];
sample_data = (%(otype)s *) %(o_sample)s->data;
state_data = (npy_int32 *) %(o_rstate)s->data;
sample_data = (%(otype)s *) PyArray_DATA(%(o_sample)s);
state_data = (npy_int32 *) PyArray_DATA(%(o_rstate)s);
for (int i = 0; i < n_elements; ++i)
{
npy_int32 * state_data_i = state_data + (i%%n_streams)*6;
......@@ -392,7 +400,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32':
otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65))
NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0
else:
......@@ -476,7 +484,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
}
}
""" %locals()
""" % locals()
def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp
......@@ -491,7 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
else:
otype = 'double'
SYNC="CNDA_THREAD_SYNC";
SYNC = "CNDA_THREAD_SYNC"
return """
//////// <code generated by mrg_uniform>
......@@ -521,7 +529,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
}
for (int i = 0; i < %(ndim)s; ++i)
{
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0];
odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
......@@ -593,7 +601,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
}
//////// </ code generated by mrg_uniform>
""" %locals()
""" % locals()
def c_code_cache_version(self):
return (7,)
......@@ -662,7 +671,7 @@ class MRG_RandomStreams(object):
elif seed >= M2:
raise ValueError('seed should be less than %i' % M2, seed)
self.rstate = numpy.asarray([seed]*6, dtype='int32')
elif len(seed)==6:
elif len(seed) == 6:
if seed[0] == 0 and seed[1] == 0 and seed[2] == 0:
raise ValueError('The first 3 values of seed should not be all 0', seed)
if seed[3] == 0 and seed[4] == 0 and seed[5] == 0:
......@@ -690,7 +699,7 @@ class MRG_RandomStreams(object):
"""
assert n_streams < 2**72
assert n_streams > 0
rval = numpy.zeros((n_streams,6), dtype='int32')
rval = numpy.zeros((n_streams, 6), dtype='int32')
rval[0] = self.rstate
for i in xrange(1, n_streams):
rval[i] = ff_2p72(rval[i - 1])
......@@ -776,11 +785,13 @@ class MRG_RandomStreams(object):
# currently no Theano node that will do a frombuffer
# reinterpretation.
u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate, ndim, dtype, size))
*GPU_mrg_uniform.new(node_rstate,
ndim, dtype, size))
else:
node_rstate = shared(self.get_substream_rstates(nstreams))
u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate, ndim, dtype, size))
*mrg_uniform.new(node_rstate,
ndim, dtype, size))
r = u * (high - low) + low
if u.type.broadcastable != r.type.broadcastable:
......@@ -934,4 +945,6 @@ def mrg_random_make_inplace(node):
new_op = op.__class__(op.output_type, inplace=True)
return new_op.make_node(*node.inputs).outputs
return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace')
optdb.register('random_make_inplace_mrg',
opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
99, 'fast_run', 'inplace')
This source diff could not be displayed because it is too large. You can view the blob instead.
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论