提交 9950ce08 authored 作者: Pascal Lamblin's avatar Pascal Lamblin

Merge pull request #1580 from nouiz/deprecated

[MRG]Deprecated
global-include *.txt global-include *.txt
global-include *.c
global-include *.cu global-include *.cu
global-include *.cuh global-include *.cuh
global-include *.sh global-include *.sh
......
...@@ -67,9 +67,9 @@ you should check the strides and alignment. ...@@ -67,9 +67,9 @@ you should check the strides and alignment.
if (!%(y)s) if (!%(y)s)
%(fail)s; %(fail)s;
{//New scope needed to make compilation work {//New scope needed to make compilation work
dtype_%(y)s * y = (dtype_%(y)s*)%(y)s->data; dtype_%(y)s * y = (dtype_%(y)s*)PyArray_DATA(%(y)s);
dtype_%(x)s * x = (dtype_%(x)s*)%(x)s->data; dtype_%(x)s * x = (dtype_%(x)s*)PyArray_DATA(%(x)s);
for (int i = 2; i < %(x)s->dimensions[0]; ++i) for (int i = 2; i < PyArray_DIMS(%(x)s)[0]; ++i)
y[i] = y[i-1]*y[i-2] + x[i]; y[i] = y[i-1]*y[i-2] + x[i];
} }
""" % locals() """ % locals()
......
...@@ -420,7 +420,9 @@ TensorVariable ...@@ -420,7 +420,9 @@ TensorVariable
.. class:: _tensor_py_operators(object) .. class:: _tensor_py_operators(object)
This mix-in class adds convenient attributes, methods, and support for Python operators (see :ref:`tensor_operator_support`). This mix-in class adds convenient attributes, methods, and support
to TensorVariable, TensorConstant and TensorSharedVariable for
Python operators (see :ref:`tensor_operator_support`).
.. attribute:: type .. attribute:: type
...@@ -472,6 +474,10 @@ TensorVariable ...@@ -472,6 +474,10 @@ TensorVariable
See :func:`flatten`. See :func:`flatten`.
.. method:: ravel()
Returns ``self.flatten()``. Provided for NumPy compatibility.
.. attribute:: T .. attribute:: T
Transpose of this tensor. Transpose of this tensor.
...@@ -485,8 +491,31 @@ TensorVariable ...@@ -485,8 +491,31 @@ TensorVariable
same vector! Use `reshape` or `dimshuffle` to turn your vector same vector! Use `reshape` or `dimshuffle` to turn your vector
into a row or column matrix. into a row or column matrix.
.. method:: {any,all}(axis=None, keepdims=False)
.. method:: {sum,prod,mean}(axis=None, dtype=None, keepdims=False, acc_dtype=None)
.. method:: {var,std,min,max,argmin,argmax}(axis=None, keepdims=False)
.. method:: diagonal(offset=0, axis1=0, axis2=1)
.. method:: astype(dtype)
.. method:: take(indices, axis=None, mode='raise')
.. method:: copy()
.. method:: norm(L, axis=None)
.. method:: nonzero(return_matrix=False)
.. method:: nonzero_values()
.. method:: sort(axis=-1, kind='quicksort', order=None)
.. method:: argsort(axis=-1, kind='quicksort', order=None)
.. method:: clip(a_min, a_max)
.. method:: conj()
.. method:: repeat(repeats, axis=None)
.. method:: round(mode="half_away_from_zero")
.. method:: trace()
.. method:: get_scalar_constant_value()
.. method:: zeros_like(model, dtype=None)
All the above methods are the Theano equivalents of the NumPy methods of the same name, applied to the current tensor.
.. method:: __{abs,neg,lt,le,gt,ge,invert,and,or,add,sub,mul,div,truediv,floordiv}__
These elementwise operations are supported via Python syntax.
Shaping and Shuffling Shaping and Shuffling
===================== =====================
......
...@@ -155,11 +155,11 @@ class WeirdBrokenOp(gof.Op): ...@@ -155,11 +155,11 @@ class WeirdBrokenOp(gof.Op):
prep_vars = """ prep_vars = """
//the output array has size M x N //the output array has size M x N
npy_intp M = PyArray_DIMS(%(a)s)[0]; npy_intp M = PyArray_DIMS(%(a)s)[0];
npy_intp Sa = %(a)s->strides[0] / PyArray_DESCR(%(a)s)->elsize; npy_intp Sa = PyArray_STRIDES(%(a)s)[0] / PyArray_DESCR(%(a)s)->elsize;
npy_intp Sz = %(z)s->strides[0] / PyArray_DESCR(%(z)s)->elsize; npy_intp Sz = PyArray_STRIDES(%(z)s)[0] / PyArray_DESCR(%(z)s)->elsize;
npy_double * Da = (npy_double*)%(a)s->data; npy_double * Da = (npy_double*)PyArray_BYTES(%(a)s);
npy_double * Dz = (npy_double*)%(z)s->data; npy_double * Dz = (npy_double*)PyArray_BYTES(%(z)s);
//clear the output array //clear the output array
for (npy_intp m = 0; m < M; ++m) for (npy_intp m = 0; m < M; ++m)
......
...@@ -1693,7 +1693,7 @@ class GCC_compiler(object): ...@@ -1693,7 +1693,7 @@ class GCC_compiler(object):
#to use the new API, but not everywhere. When finished, enable #to use the new API, but not everywhere. When finished, enable
#the following macro to assert that we don't bring new code #the following macro to assert that we don't bring new code
#that use the old API. #that use the old API.
#cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION") cxxflags.append("-D NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION")
numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]] numpy_ver = [int(n) for n in numpy.__version__.split('.')[:2]]
# numpy 1.7 deprecated the following macro but the new one didn't # numpy 1.7 deprecated the following macro but the new one didn't
......
...@@ -76,10 +76,7 @@ except ImportError: ...@@ -76,10 +76,7 @@ except ImportError:
except ImportError: except ImportError:
_logger.info("Compiling new CVM") _logger.info("Compiling new CVM")
dirname = 'lazylinker_ext' dirname = 'lazylinker_ext'
# We use a .txt extensions as otherwise it don't get cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c')
# included when we create a package to send to pypi
# This happen even if we tell to include *.c files
cfile = os.path.join(theano.__path__[0], 'gof', 'lazylinker_c.c.txt')
code = open(cfile).read() code = open(cfile).read()
loc = os.path.join(config.compiledir, dirname) loc = os.path.join(config.compiledir, dirname)
if not os.path.exists(loc): if not os.path.exists(loc):
......
...@@ -220,6 +220,7 @@ if __name__ == "__main__": ...@@ -220,6 +220,7 @@ if __name__ == "__main__":
GTX 650 Ti 0.27s GTX 650 Ti 0.27s
GTX 460 0.37s 0.45s GTX 460 0.37s 0.45s
GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version? GTX 285 0.42s 0.452s 0.452s 0.40s # cuda 3.0 seems faster? driver version?
750M 0.49s
GTX 550 Ti 0.57s GTX 550 Ti 0.57s
GT 520 2.68s 3.06s GT 520 2.68s 3.06s
520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04 520M 2.44s 3.19s # with bumblebee on Ubuntu 12.04
......
...@@ -2223,12 +2223,6 @@ class GpuReshape(tensor.Reshape, GpuOp): ...@@ -2223,12 +2223,6 @@ class GpuReshape(tensor.Reshape, GpuOp):
out[0] = x.reshape(tuple(shp)) out[0] = x.reshape(tuple(shp))
# C Code shared by GpuSubtensor and GpuIncSubtensor
_define_set_data = """
#define CudaNdarray_set_device_data2(obj, ptr, base) \
CudaNdarray_set_device_data(obj, (float *)ptr, base)
"""
class GpuSubtensor(GpuOp, tensor.Subtensor): class GpuSubtensor(GpuOp, tensor.Subtensor):
""" """
Implement subtensor on the gpu. Implement subtensor on the gpu.
...@@ -2276,16 +2270,27 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2276,16 +2270,27 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
view_ndim = node.outputs[0].ndim view_ndim = node.outputs[0].ndim
fail = sub['fail'] fail = sub['fail']
decl = "CudaNdarray* xview = NULL;"
get_xview = self.helper_c_code(node, name, inputs, outputs, sub,
self.idx_list,
view_ndim=view_ndim,
c_prefix='CudaNdarray',
strides_mul=4,
)
build_view = """ build_view = """
//TODO: give this Op a second output so that this view can be cached //TODO: give this Op a second output so that this view can be cached
//TODO: alternatively, fix the memory leak on failure //TODO: alternatively, fix the memory leak on failure
CudaNdarray* xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s); xview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
if (!xview) if (!xview)
{ {
%(fail)s; %(fail)s;
} }
if (CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL)) if (CudaNdarray_set_device_data(
xview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{ {
PyErr_Format(PyExc_RuntimeError, PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the" "GpuSubtensor is not able to set the"
...@@ -2294,43 +2299,24 @@ class GpuSubtensor(GpuOp, tensor.Subtensor): ...@@ -2294,43 +2299,24 @@ class GpuSubtensor(GpuOp, tensor.Subtensor):
%(fail)s; %(fail)s;
} }
cnda_mark_dev_structure_dirty(xview); cnda_mark_dev_structure_dirty(xview);
""" % locals() for(int idx=0;idx <%(view_ndim)s; idx++){
//For broadcasted dimensions, set the strides to 0
get_xview = _define_set_data + \ //We can't do that only for broadcasted dimensions as this can happen
self.helper_c_code(node, name, inputs, outputs, sub, //for dimensions of size 0. That are rebroadcated later.
self.idx_list, if(xview_dims[idx]==1)
c_prefix='CudaNdarray', CudaNdarray_set_stride(xview, idx, 0);
set_data='CudaNdarray_set_device_data2', else
set_dim='CudaNdarray_set_dim', CudaNdarray_set_stride(xview, idx, xview_strides[idx]);
set_stride='CudaNdarray_set_stride', CudaNdarray_set_dim(xview, idx, xview_dims[idx]);
update_flags="", strides_mul=4)
finish_view = ""
#For broadcasted dimensions, set the strides to 0
#We can't do that only for broadcasted dimensions as this can happen for dimensions of size 0,
#That are rebroadcated later.
for idx in range(node.outputs[0].ndim):
finish_view += """
if(CudaNdarray_HOST_DIMS(xview)[%(idx)s]==1)
CudaNdarray_set_stride(xview, %(idx)s, 0);
""" % locals()
finish_view += """
//Set the base only now
if(CudaNdarray_set_device_data(xview, CudaNdarray_DEV_DATA(xview),
%(x)s)){
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set"
" the base of the view array");
Py_XDECREF(xview);
%(fail)s;
} }
""" % locals()
finish_view = """
Py_XDECREF(%(z)s); Py_XDECREF(%(z)s);
%(z)s = xview; %(z)s = xview;
""" % locals() """ % locals()
return build_view + "{" + get_xview + "}" + finish_view return decl + get_xview + build_view + finish_view
def c_code_cache_version(self): def c_code_cache_version(self):
hv = self.helper_c_code_cache_version() hv = self.helper_c_code_cache_version()
...@@ -2719,6 +2705,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1): ...@@ -2719,6 +2705,7 @@ class GpuAdvancedIncSubtensor1_dev20(GpuAdvancedIncSubtensor1):
""" %locals() """ %locals()
class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
Implement IncSubtensor on the gpu. Implement IncSubtensor on the gpu.
...@@ -2756,6 +2743,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2756,6 +2743,9 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals() return """(CudaNdarray*) CudaNdarray_Copy(%(x)s)""" % locals()
def decl_view(self):
return "CudaNdarray* zview = NULL;"
def make_view_array(self, x, view_ndim): def make_view_array(self, x, view_ndim):
""" """
:param x: a string identifying an array to be viewed :param x: a string identifying an array to be viewed
...@@ -2765,17 +2755,32 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2765,17 +2755,32 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
This doesn't need to actually set up the view with the This doesn't need to actually set up the view with the
right indexing; we'll do that manually later. right indexing; we'll do that manually later.
""" """
return """CudaNdarray* zview = (CudaNdarray*) ret = """zview = (CudaNdarray*) CudaNdarray_New(%(view_ndim)s);
CudaNdarray_New(%(view_ndim)s)""" % locals() if (CudaNdarray_set_device_data(
zview,
CudaNdarray_DEV_DATA(%(x)s) + xview_offset/4,
(PyObject*) %(x)s))
{
zview = NULL;
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
}else{
cnda_mark_dev_structure_dirty(zview);
for(int idx=0;idx <%(view_ndim)s; idx++){
if(xview_dims[idx]==1)
CudaNdarray_set_stride(zview, idx, 0);
else
CudaNdarray_set_stride(zview, idx, xview_strides[idx]);
CudaNdarray_set_dim(zview, idx, xview_dims[idx]);
}
}
""" % locals()
return ret
def get_helper_c_code_args(self): def get_helper_c_code_args(self):
""" Return a dictionary of arguments to use with helper_c_code""" """ Return a dictionary of arguments to use with helper_c_code"""
return { 'update_flags' : "", return {'c_prefix': 'CudaNdarray',
'c_prefix' : 'CudaNdarray',
'set_data' :'CudaNdarray_set_device_data2',
'set_dim' : 'CudaNdarray_set_dim',
'set_stride' : 'CudaNdarray_set_stride',
'update_flags' : "",
'strides_mul': 4 'strides_mul': 4
} }
...@@ -2789,24 +2794,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2789,24 +2794,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" """
return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals() return """CudaNdarray_CopyFromCudaNdarray(%(view)s, %(source)s)""" % locals()
def define_set_data(self):
return _define_set_data
def link_view_array(self, x, fail):
return """
if (CudaNdarray_set_device_data(zview, CudaNdarray_DEV_DATA(%(x)s),
(PyObject*) NULL))
{
PyErr_Format(PyExc_RuntimeError,
"GpuSubtensor is not able to set the"
" devdata field of the view");
Py_XDECREF(zview);
%(fail)s;
}
cnda_mark_dev_structure_dirty(zview);
""" % locals()
def set_view_base(self, x, fail): def set_view_base(self, x, fail):
return """ return """
//Set the base only now //Set the base only now
...@@ -2823,7 +2810,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2823,7 +2810,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
def add_to_zview(self, x, fail): def add_to_zview(self, x, fail):
return """ return """
PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview, PyObject * add_result = CudaNdarray_inplace_add((PyObject *) zview,
(PyObject *) py_%(x)s); (PyObject *) py_%(x)s);
...@@ -2839,7 +2825,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp): ...@@ -2839,7 +2825,6 @@ class GpuIncSubtensor(tensor.IncSubtensor, GpuOp):
""" % locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
parent_version = super(GpuIncSubtensor, self).c_code_cache_version() parent_version = super(GpuIncSubtensor, self).c_code_cache_version()
if parent_version: if parent_version:
return parent_version + (0,) return parent_version + (0,)
......
...@@ -5,13 +5,14 @@ Generator code in SSJ package (L'Ecuyer & Simard) ...@@ -5,13 +5,14 @@ Generator code in SSJ package (L'Ecuyer & Simard)
http://www.iro.umontreal.ca/~simardr/ssj/indexe.html http://www.iro.umontreal.ca/~simardr/ssj/indexe.html
""" """
import sys, warnings import warnings
import numpy import numpy
from theano import Op, Apply, shared, config, Variable from theano import Op, Apply, shared, config, Variable
from theano.tensor import (raw_random, TensorType, as_tensor_variable, from theano.tensor import (raw_random, TensorType, as_tensor_variable,
get_vector_length, cast, opt, scal) get_vector_length, cast, opt, scal)
from theano.tensor import zeros_like, sqrt, log, sin, cos, join, prod from theano.tensor import sqrt, log, sin, cos, join, prod
from theano.compile import optdb from theano.compile import optdb
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.python25 import all, any from theano.gof.python25 import all, any
...@@ -36,6 +37,7 @@ def matVecModM(A, s, m): ...@@ -36,6 +37,7 @@ def matVecModM(A, s, m):
x[i] = r + m x[i] = r + m
return x return x
def multMatVect(v, A, m1, B, m2): def multMatVect(v, A, m1, B, m2):
#multiply the first half of v by A with a modulo of m1 #multiply the first half of v by A with a modulo of m1
#and the second half by B with a modulo of m2 #and the second half by B with a modulo of m2
...@@ -79,9 +81,11 @@ A2p134 = numpy.asarray( ...@@ -79,9 +81,11 @@ A2p134 = numpy.asarray(
[1401213391, 1178684362, 1431130166]]) [1401213391, 1178684362, 1431130166]])
np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)] np_int32_vals = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
def ff_2p134(rstate): def ff_2p134(rstate):
return multMatVect(rstate, A1p134, M1, A2p134, M2) return multMatVect(rstate, A1p134, M1, A2p134, M2)
def ff_2p72(rstate): def ff_2p72(rstate):
return multMatVect(rstate, A1p72, M1, A2p72, M2) return multMatVect(rstate, A1p72, M1, A2p72, M2)
...@@ -93,8 +97,8 @@ def mrg_next_value(rstate, new_rstate): ...@@ -93,8 +97,8 @@ def mrg_next_value(rstate, new_rstate):
#i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)] #i0, i7, i9, i15, i16, i22, i24 = [numpy.int32(i) for i in (0, 7, 9, 15, 16, 22, 24)]
i0, i7, i9, i15, i16, i22, i24 = np_int32_vals i0, i7, i9, i15, i16, i22, i24 = np_int32_vals
#first component #first component
y1 = (((x12 & MASK12) << i22) + (x12 >> i9) y1 = (((x12 & MASK12) << i22) + (x12 >> i9) +
+ ((x13 & MASK13) << i7) + (x13 >> i24)) ((x13 & MASK13) << i7) + (x13 >> i24))
assert type(y1) == numpy.int32 assert type(y1) == numpy.int32
if (y1 < 0 or y1 >= M1): #must also check overflow if (y1 < 0 or y1 >= M1): #must also check overflow
...@@ -135,6 +139,7 @@ def mrg_next_value(rstate, new_rstate): ...@@ -135,6 +139,7 @@ def mrg_next_value(rstate, new_rstate):
else: else:
return (x11 - x21) * NORM return (x11 - x21) * NORM
class mrg_uniform_base(Op): class mrg_uniform_base(Op):
def __init__(self, output_type, inplace=False): def __init__(self, output_type, inplace=False):
Op.__init__(self) Op.__init__(self)
...@@ -145,17 +150,19 @@ class mrg_uniform_base(Op): ...@@ -145,17 +150,19 @@ class mrg_uniform_base(Op):
self.warned_numpy_version = False self.warned_numpy_version = False
def __eq__(self, other): def __eq__(self, other):
return type(self) == type(other) \ return (type(self) == type(other) and
and self.output_type == other.output_type \ self.output_type == other.output_type and
and self.inplace == other.inplace self.inplace == other.inplace)
def __hash__(self): def __hash__(self):
return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace) return hash(type(self)) ^ hash(self.output_type) ^ hash(self.inplace)
def __str__(self): def __str__(self):
if self.inplace: if self.inplace:
s = "inplace" s = "inplace"
else: s = "no_inplace" else:
return self.__class__.__name__+"{%s,%s}"%(self.output_type,s) s = "no_inplace"
return self.__class__.__name__ + "{%s,%s}" % (self.output_type, s)
def make_node(self, rstate, size): def make_node(self, rstate, size):
# error checking slightly redundant here, since # error checking slightly redundant here, since
...@@ -166,7 +173,7 @@ class mrg_uniform_base(Op): ...@@ -166,7 +173,7 @@ class mrg_uniform_base(Op):
[rstate, size], [rstate, size],
[rstate.type(), self.output_type()]) [rstate.type(), self.output_type()])
def grad(self,inputs,ograd): def grad(self, inputs, ograd):
return [None for i in inputs] return [None for i in inputs]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
...@@ -187,8 +194,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -187,8 +194,8 @@ class mrg_uniform(mrg_uniform_base):
def perform(self, node, inp, out): def perform(self, node, inp, out):
rstate, size = inp rstate, size = inp
o_rstate, o_sample = out o_rstate, o_sample = out
numpy_version=numpy.__version__.split('.') numpy_version = numpy.__version__.split('.')
if not self.warned_numpy_version and int(numpy_version[0])<=1 and int(numpy_version[1])<3: if not self.warned_numpy_version and int(numpy_version[0]) <= 1 and int(numpy_version[1]) <3 :
print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy" print "Warning: you must use numpy version 1.3.0 or higher with the python version of this op. Otherwise numpy leak memory. and numpy"
self.warned_numpy_version = True self.warned_numpy_version = True
...@@ -201,20 +208,21 @@ class mrg_uniform(mrg_uniform_base): ...@@ -201,20 +208,21 @@ class mrg_uniform(mrg_uniform_base):
for s in size: for s in size:
n_elements *= s n_elements *= s
n_streams,_ = rstate.shape n_streams, _ = rstate.shape
rval = numpy.zeros(n_elements, dtype=self.output_type.dtype) rval = numpy.zeros(n_elements, dtype=self.output_type.dtype)
err_orig = numpy.seterr(over='ignore') err_orig = numpy.seterr(over='ignore')
try: try:
for i in xrange(n_elements): for i in xrange(n_elements):
sample = mrg_next_value(rstate[i%n_streams], rstate[i%n_streams]) sample = mrg_next_value(rstate[i % n_streams],
rstate[i % n_streams])
rval[i] = sample rval[i] = sample
finally: finally:
numpy.seterr(**err_orig) numpy.seterr(**err_orig)
o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary o_rstate[0] = node.outputs[0].type.filter(rstate) # send to GPU if necessary
o_sample[0] = node.outputs[1].type.filter(rval.reshape(size))# send to GPU if necessary o_sample[0] = node.outputs[1].type.filter(rval.reshape(size)) # send to GPU if necessary
def c_code(self, node, name, inp, out, sub): def c_code(self, node, name, inp, out, sub):
rstate, size = inp rstate, size = inp
...@@ -228,7 +236,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -228,7 +236,7 @@ class mrg_uniform(mrg_uniform_base):
fail = sub['fail'] fail = sub['fail']
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that # this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0 # numpy.float32(number * M1) < 1.0
else: else:
...@@ -279,7 +287,7 @@ class mrg_uniform(mrg_uniform_base): ...@@ -279,7 +287,7 @@ class mrg_uniform(mrg_uniform_base):
} }
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0]; odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]); must_alloc_sample = must_alloc_sample || (PyArray_DIMS(%(o_sample)s)[i] != odims[i]);
//fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]); //fprintf(stderr, "size %%i %%i\\n", i, (int)odims[i]);
...@@ -313,8 +321,8 @@ class mrg_uniform(mrg_uniform_base): ...@@ -313,8 +321,8 @@ class mrg_uniform(mrg_uniform_base):
} }
n_streams = PyArray_DIMS(%(o_rstate)s)[0]; n_streams = PyArray_DIMS(%(o_rstate)s)[0];
sample_data = (%(otype)s *) %(o_sample)s->data; sample_data = (%(otype)s *) PyArray_DATA(%(o_sample)s);
state_data = (npy_int32 *) %(o_rstate)s->data; state_data = (npy_int32 *) PyArray_DATA(%(o_rstate)s);
for (int i = 0; i < n_elements; ++i) for (int i = 0; i < n_elements; ++i)
{ {
npy_int32 * state_data_i = state_data + (i%%n_streams)*6; npy_int32 * state_data_i = state_data + (i%%n_streams)*6;
...@@ -392,7 +400,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -392,7 +400,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
def c_support_code_apply(self, node, nodename): def c_support_code_apply(self, node, nodename):
if self.output_type.dtype == 'float32': if self.output_type.dtype == 'float32':
otype = 'float' otype = 'float'
NORM = '4.6566126e-10f' #numpy.float32(1.0/(2**31+65)) NORM = '4.6566126e-10f' # numpy.float32(1.0/(2**31+65))
# this was determined by finding the biggest number such that # this was determined by finding the biggest number such that
# numpy.float32(number * M1) < 1.0 # numpy.float32(number * M1) < 1.0
else: else:
...@@ -476,7 +484,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -476,7 +484,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
} }
} }
""" %locals() """ % locals()
def c_code(self, node, nodename, inp, out, sub): def c_code(self, node, nodename, inp, out, sub):
rstate, size = inp rstate, size = inp
...@@ -491,7 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -491,7 +499,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
else: else:
otype = 'double' otype = 'double'
SYNC="CNDA_THREAD_SYNC"; SYNC = "CNDA_THREAD_SYNC"
return """ return """
//////// <code generated by mrg_uniform> //////// <code generated by mrg_uniform>
...@@ -521,7 +529,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -521,7 +529,7 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
} }
for (int i = 0; i < %(ndim)s; ++i) for (int i = 0; i < %(ndim)s; ++i)
{ {
odims[i] = ((npy_int32*)(%(size)s->data + %(size)s->strides[0] * i))[0]; odims[i] = ((npy_int32*)(PyArray_BYTES(%(size)s) + PyArray_STRIDES(%(size)s)[0] * i))[0];
n_elements *= odims[i]; n_elements *= odims[i];
must_alloc_sample = (must_alloc_sample must_alloc_sample = (must_alloc_sample
|| CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]); || CudaNdarray_HOST_DIMS(%(o_sample)s)[i] != odims[i]);
...@@ -593,7 +601,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp): ...@@ -593,7 +601,8 @@ class GPU_mrg_uniform(mrg_uniform_base, GpuOp):
} }
//////// </ code generated by mrg_uniform> //////// </ code generated by mrg_uniform>
""" %locals() """ % locals()
def c_code_cache_version(self): def c_code_cache_version(self):
return (7,) return (7,)
...@@ -662,7 +671,7 @@ class MRG_RandomStreams(object): ...@@ -662,7 +671,7 @@ class MRG_RandomStreams(object):
elif seed >= M2: elif seed >= M2:
raise ValueError('seed should be less than %i' % M2, seed) raise ValueError('seed should be less than %i' % M2, seed)
self.rstate = numpy.asarray([seed]*6, dtype='int32') self.rstate = numpy.asarray([seed]*6, dtype='int32')
elif len(seed)==6: elif len(seed) == 6:
if seed[0] == 0 and seed[1] == 0 and seed[2] == 0: if seed[0] == 0 and seed[1] == 0 and seed[2] == 0:
raise ValueError('The first 3 values of seed should not be all 0', seed) raise ValueError('The first 3 values of seed should not be all 0', seed)
if seed[3] == 0 and seed[4] == 0 and seed[5] == 0: if seed[3] == 0 and seed[4] == 0 and seed[5] == 0:
...@@ -690,7 +699,7 @@ class MRG_RandomStreams(object): ...@@ -690,7 +699,7 @@ class MRG_RandomStreams(object):
""" """
assert n_streams < 2**72 assert n_streams < 2**72
assert n_streams > 0 assert n_streams > 0
rval = numpy.zeros((n_streams,6), dtype='int32') rval = numpy.zeros((n_streams, 6), dtype='int32')
rval[0] = self.rstate rval[0] = self.rstate
for i in xrange(1, n_streams): for i in xrange(1, n_streams):
rval[i] = ff_2p72(rval[i - 1]) rval[i] = ff_2p72(rval[i - 1])
...@@ -776,11 +785,13 @@ class MRG_RandomStreams(object): ...@@ -776,11 +785,13 @@ class MRG_RandomStreams(object):
# currently no Theano node that will do a frombuffer # currently no Theano node that will do a frombuffer
# reinterpretation. # reinterpretation.
u = self.pretty_return(node_rstate, u = self.pretty_return(node_rstate,
*GPU_mrg_uniform.new(node_rstate, ndim, dtype, size)) *GPU_mrg_uniform.new(node_rstate,
ndim, dtype, size))
else: else:
node_rstate = shared(self.get_substream_rstates(nstreams)) node_rstate = shared(self.get_substream_rstates(nstreams))
u = self.pretty_return(node_rstate, u = self.pretty_return(node_rstate,
*mrg_uniform.new(node_rstate, ndim, dtype, size)) *mrg_uniform.new(node_rstate,
ndim, dtype, size))
r = u * (high - low) + low r = u * (high - low) + low
if u.type.broadcastable != r.type.broadcastable: if u.type.broadcastable != r.type.broadcastable:
...@@ -934,4 +945,6 @@ def mrg_random_make_inplace(node): ...@@ -934,4 +945,6 @@ def mrg_random_make_inplace(node):
new_op = op.__class__(op.output_type, inplace=True) new_op = op.__class__(op.output_type, inplace=True)
return new_op.make_node(*node.inputs).outputs return new_op.make_node(*node.inputs).outputs
return False return False
optdb.register('random_make_inplace_mrg', opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, 'fast_run', 'inplace') optdb.register('random_make_inplace_mrg',
opt.in2out(mrg_random_make_inplace, ignore_newtrees=True),
99, 'fast_run', 'inplace')
This source diff could not be displayed because it is too large. You can view the blob instead.
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
差异被折叠。
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论