提交 cd9e62a0 authored 作者: James Bergstra's avatar James Bergstra

merge NC

差异被折叠。
.. _extending_theano:
****************
Extending Theano
****************
Theano graphs
-------------
- Theano works with symbolic graphs
- Those graphs are bi-partite graphs (graph with 2 types of nodes)
- Those 2 nodes types are Apply and Variable nodes
Inputs and Outputs are lists of Theano variables
.. image:: pics/apply_node.png
:width: 500 px
Op contract
-----------
.. code-block:: python
import theano
class MyOp(Op):
def __eq__(self, other):
def __hash__(self):
def __str__(self):
def make_node(self, x):
# Python implementation:
def perform(self, node, inputs_storage, output_storage):
# C implementation: [see theano web site]
# others implementation (pycuda, ...):
def make_thunk(self, node, storage_map, _, _2):
# optional:
def __init__(self, ...):
def grad(self, inputs, g):
def infer_shape(node, (i0_shapes, ...))
Op example
----------
.. code-block:: python
import theano
class DoubleOp(theano.Op):
    """Element-wise Op returning its input multiplied by two.

    Minimal example of the Theano Op contract with a pure-Python
    implementation (``perform``). Indentation restored — the extracted
    slide had flattened it, which made the snippet non-runnable.
    """

    # __eq__/__hash__: Ops that compare equal may be merged during
    # graph optimization; this Op has no parameters, so type identity
    # is enough.
    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, x):
        # Accept anything convertible to a tensor variable; the output
        # has the same type as the input.
        x = theano.tensor.as_tensor_variable(x)
        return theano.Apply(self, [x], [x.type()])

    def perform(self, node, inputs, output_storage):
        # Python implementation: write the result into the output
        # storage cell (a one-element list).
        x = inputs[0]
        z = output_storage[0]
        z[0] = x * 2
Test it!
>>> x = theano.tensor.matrix()
>>> f = theano.function([x],DoubleOp()(x))
>>> import numpy
>>> inp = numpy.random.rand(5,5)
>>> out = f(inp)
>>> assert numpy.allclose(inp*2, out)
>>> print inp
>>> print out
Exercises 7
-----------
- Run the code in the file double_op.py.
- Modify and execute to compute: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only 1 output
Theano + PyCUDA
---------------
.. code-block:: python
import numpy, theano
import theano.misc.pycuda_init
from pycuda.compiler import SourceModule
import theano.sandbox.cuda as cuda
class PyCUDADoubleOp(theano.Op):
    """Double a float32 CUDA ndarray using a hand-written PyCUDA kernel.

    Demonstrates ``make_thunk``: the CUDA kernel is compiled once when
    the thunk is built, then launched on every call. Indentation
    restored — the extracted slide had flattened it.
    """

    def __eq__(self, other):
        return type(self) == type(other)

    def __hash__(self):
        return hash(type(self))

    def __str__(self):
        return self.__class__.__name__

    def make_node(self, inp):
        # The kernel indexes memory linearly, so force a C-contiguous
        # GPU array; only float32 is supported here.
        inp = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(inp))
        assert inp.dtype == "float32"
        return theano.Apply(self, [inp], [inp.type()])

    def make_thunk(self, node, storage_map, _, _2):
        # Compile the CUDA kernel once, at thunk-construction time.
        mod = SourceModule("""
        __global__ void my_fct(float * i0, float * o0, int size) {
            int i = blockIdx.x*blockDim.x + threadIdx.x;
            if(i<size){
                o0[i] = i0[i]*2;
            }
        }""")
        pycuda_fct = mod.get_function("my_fct")
        # storage_map maps each Variable to its one-element storage cell.
        inputs = [storage_map[v] for v in node.inputs]
        outputs = [storage_map[v] for v in node.outputs]

        def thunk():
            z = outputs[0]
            # (Re)allocate the output only when its shape changed.
            if z[0] is None or z[0].shape != inputs[0][0].shape:
                z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
            # One thread per element, 512 threads per block.
            grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
            pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                       block=(512, 1, 1), grid=grid)
        return thunk
Test it!
>>> x = theano.tensor.fmatrix()
>>> f = theano.function([x], PyCUDADoubleOp()(x))
>>> xv=numpy.ones((4,5), dtype="float32")
>>> assert numpy.allclose(f(xv), xv*2)
>>> print numpy.asarray(f(xv))
Exercises 8
-----------
- Run the above example
- Modify and execute the example to multiply two matrices: x * y
- Modify and execute the example to return 2 outputs: x + y and x - y
- Our current elemwise fusion generates computations with only 1 output
- Modify and execute the example to support stride? (Don't force the input to be c contiguous)
.. _gpundarray:
**********
GpuNdArray
**********
Why a common GPU ndarray?
- Currently there are at least 4 different GPU array data structures in use by Python packages
- CudaNdarray (Theano), GPUArray (PyCUDA), CUDAMatrix (cudamat), GPUArray (PyOpenCL), ...
- There are even more if we include other languages
- All of them are a subset of the functionality of ``numpy.ndarray`` on the GPU
- Lots of duplicated effort
- GPU code is harder/slower to do **correctly** and **fast** than CPU/Python code
- Lack of a common array API makes it harder to port/reuse code
- Also harder to find/distribute code
- Divides development work
Design Goals
- Make it VERY similar to ``numpy.ndarray``
- Be compatible with both CUDA and OpenCL
- Have the base object accessible from C to allow collaboration with more projects, across high-level languages
- We want people from C, C++, Ruby, R, ... all use the same base GPU N-dimensional array
Final GpuNdArray Note
- Under development
- Will be the next GPU array container for Theano (this summer!)
- Probably also for PyCUDA, PyOpenCL
- Mailing list: http://lists.tiker.net/listinfo/gpundarray
.. _index:
=========================
GPU programming made Easy
=========================
.. toctree::
introduction
theano
advanced_theano
pyCUDA
extending_theano
gpundarray
.. _introduction:
************
Introduction
************
Theano motivations
------------------
Theano tries to be the **holy grail** of computing: *easy to code* and *fast to execute*!
it works only on mathematical expressions, so you won't have:
- Function call inside a theano function
- Structure, enum
- Dynamic type (Theano is Fully typed)
Unfortunately it doesn't do coffee... yet.
.. image:: pics/Caffeine_Machine_no_background_red.png
Theano status
-------------
Why you can rely on Theano:
- Theano has been developed and used since January 2008 (3.5 yrs old)
- Core technology for a funded Silicon-Valley startup
- Driven over 40 research papers in the last few years
- Good user documentation
- Active mailing list with participants from outside our lab
- Many contributors (some from outside our lab)
- Used to teach IFT6266 for two years
- Used by everyone in our lab (~30 people)
- Deep Learning Tutorials
- Unofficial RPMs for Mandriva
- Downloads (June 8 2011, since last January): Pypi 780, MLOSS: 483, Assembla (``bleeding edge'' repository): unknown
Why scripting for GPUs ?
------------------------
**GPUs?**
- Faster, cheaper, more efficient power usage
- How much faster? I have seen numbers from 100x slower to 1000x faster.
- It depends on the algorithms
- How the benchmark is done
- Quality of implementation
- How much time was spent optimizing CPU vs GPU code
- In Theory:
- Intel Core i7 980 XE (107Gf/s float64) 6 cores
- NVIDIA C2050 (515 Gf/s float64, 1Tf/s float32) 480 cores
- NVIDIA GTX580 (1.5Tf/s float32) 512 cores
- Theano goes up to 100x faster on the GPU because we don't use multiple cores on the CPU
- Theano can be linked with multi-core capable BLAS (GEMM and GEMV)
- If you see 1000x, it probably means the benchmark is not fair
**Scripting for GPUs?**
They *Complement each other*
- GPUs are everything that scripting/high level languages are not
- Highly parallel
- Very architecture-sensitive
- Built for maximum FP/memory throughput
- CPU: largely restricted to control
- Optimized for sequential code and low latency (rather than high throughput)
- Tasks (1000/sec)
- Scripting fast enough
Theano vs PyCUDA vs PyOpenCL vs CUDA
------------------------------------
- Theano
- Mathematical expression compiler
- Generates custom C and CUDA code
- Uses Python code when performance is not critical
- CUDA
- C extension by NVIDIA that allows one to write code for and use the GPU
- PyCUDA (Python + CUDA)
- Python interface to CUDA
- Memory management of GPU objects
- Compilation of code for the low-level driver
- PyOpenCL (Python + OpenCL)
- PyCUDA for OpenCL
Python
------
- Interpreted language
- General-purpose high-level programming language
- OO and scripting language
- Emphasizes code readability
- Large and comprehensive standard library
- Indentation for block delimiters
- Dynamic type and memory management
- Dictionary ``d={'var1':'value1', 'var2':42, ...}``
- List comprehension: ``[i+3 for i in range(10)]``
NumPy
-----
- Base scientific computing package in Python on the CPU
- A powerful N-dimensional array object
- ndarray.{ndim, shape, size, dtype, itemsize, stride}
- Sophisticated broadcasting functions
- ``numpy.random.rand(4,5) * numpy.random.rand(1,5)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(4,1)`` -> mat(4,5)
- ``numpy.random.rand(4,5) * numpy.random.rand(5)`` -> mat(4,5)
- Tools for integrating C/C++ and Fortran code
- Linear algebra, Fourier transform and pseudorandom number generation
.. _pyCUDA:
******
PyCUDA
******
Introduction
------------
Author: Andreas Klöckner
- PyCUDA can access Nvidia's CUDA parallel computation API from Python
- Object cleanup tied to lifetime of objects (RAII, Resource Acquisition Is Initialization).
- Makes it much easier to write correct, leak- and crash-free code
- PyCUDA knows about dependencies (e.g. it won't detach from a context before all memory allocated in it is also freed)
- Convenience
- Abstractions to compile CUDA code from Python: ``pycuda.driver.SourceModule``
- A GPU memory buffer: ``pycuda.gpuarray.GPUArray``
- Completeness
- Binding to all of CUDA's driver API
- Automatic Error Checking
- All CUDA errors are automatically translated into Python exceptions
- Speed
- PyCUDA's base layer is written in C++
- Helpful documentation
Example
-------
.. code-block:: python
import pycuda.autoinit
import pycuda.driver as drv
import numpy
from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
const int i = threadIdx.x;
dest[i] = a[i] * b[i];
}
""")
multiply_them = mod.get_function("multiply_them")
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)
multiply_them(
drv.Out(dest), drv.In(a), drv.In(b),
block=(400,1,1), grid=(1,1))
assert numpy.allclose(dest, a*b)
print dest
Exercise 6
----------
- Run the above example
- Modify and execute it to work for a matrix of 20 x 10
差异被折叠。
......@@ -1183,36 +1183,46 @@ class _Linker(gof.link.LocalLinker):
thunks_py = [] #python thunks
thunks_c = [] #c thunks
compute_map = {}
for k in storage_map:
compute_map[k] = [k.owner is None]
for node in order:
node_input_storage = [storage_map[r] for r in node.inputs]
node_output_storage = [storage_map[r] for r in node.outputs]
try:
if not self.maker.mode.check_c_code:
raise utils.MethodNotDefined()
e = Env(*graph.clone(node.inputs, node.outputs))
e.toposort = lambda: e.nodes #WARNING: STOCHASTIC ORDER
# Specifically... e.nodes is a set, but of only 1 element
cl = CLinker().accept(e, [r for r, r2 in zip(e.outputs, node.outputs) if r2 in no_recycling])
thunk, node_input_filters, node_output_filters = cl.make_thunk(
input_storage = node_input_storage,
output_storage = node_output_storage)
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunks_c.append(thunk)
except (NotImplementedError, utils.MethodNotDefined):
thunks_c.append(None)
if hasattr(node.op, '_op_use_c_code'):
old_value = node.op._op_use_c_code
else:
old_value = False
try:
# ! Problem ! We do not know if make_thunk succedded into
# generating a cthunk, or if it reverted back to a python
# thunk, or if it is none of the above ...
node.op._op_use_c_code = True
tmp_thunk = node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)
if hasattr(tmp_thunk, 'cthunk'):
# Arbritrary check to see if it has a C implementation
thunks_c.append(tmp_thunk)
else:
thunks_c.append(None)
finally:
node.op._op_use_c_code = old_value
if self.maker.mode.check_py_code or thunks_c[-1] is None:
p = node.op.perform
thunk = (lambda p = p, i = node_input_storage, o = node_output_storage, n =
node: p(n, [x[0] for x in i], o))
thunk.inputs = node_input_storage
thunk.outputs = node_output_storage
thunk.perform = p
thunks_py.append(thunk)
try:
node.op._op_use_c_code = False
thunks_py += [node.op.make_thunk(node,
storage_map,
compute_map,
no_recycling)]
finally:
node.op._op_use_c_code = old_value
else:
thunks_py.append(None)
......@@ -1233,6 +1243,11 @@ class _Linker(gof.link.LocalLinker):
# This is the function that runs when you evaluate the graph
#####
def f():
####
# Note: `f` ignores the compute_map and evaluates the nodes in
# topological order. In some sense, this is ok, and can be used
# for now.
#####
_logger.debug("starting a DebugMode call")
for x in no_recycling:
x[0] = None
......
......@@ -401,7 +401,9 @@ class PerformLinker(LocalLinker):
for node in order:
# Maker sure we don't use C version of the code, but rather only
# the python version
old_value = node.op._op_use_c_code
# Note : ops that implement their own make thunk don't usually
# have this attribute defiend !!
old_value = getattr(node.op, '_op_use_c_code', False)
try:
node.op._op_use_c_code = False
thunks += [node.op.make_thunk(node,
......
......@@ -1063,6 +1063,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
start_from = env.outputs
changed = True
max_use_abort = False
opt_name = None
process_count = {}
while changed and not max_use_abort:
......@@ -1099,6 +1100,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
process_count.setdefault(lopt, 0)
if process_count[lopt] > max_use:
max_use_abort = True
opt_name = lopt.name
else:
lopt_change = self.process_node(env, node, lopt)
if lopt_change:
......@@ -1110,7 +1112,7 @@ class EquilibriumOptimizer(NavigatorOptimizer):
self.detach_updater(env, u)
self.detach_updater(env, u) #TODO: erase this line, it's redundant at best
if max_use_abort:
_logger.error("EquilibriumOptimizer max'ed out")
_logger.error("EquilibriumOptimizer max'ed out by "+opt_name)
def print_summary(self, stream=sys.stdout, level=0):
print >> stream, "%s%s id=%i" %(' '*level, self.__class__.__name__, id(self))
......
......@@ -168,7 +168,10 @@ class EquilibriumDB(DB):
opts = super(EquilibriumDB, self).query(*tags, **kwtags)
return opt.EquilibriumOptimizer(opts,
max_depth=5,
max_use_ratio=11,#upgraded to 11 to don't generated useless output in test.
max_use_ratio=50,#upgraded to 50 to avoid equibriumOptimizer
# to be max'ed out by constant folding (can
# I increase the max ratio only for
# constant folding somehow?
failure_callback=opt.NavigatorOptimizer.warn_inplace)
......
......@@ -71,6 +71,14 @@ def test_pycuda_memory_to_theano():
print "gpuarray ref count after creating a CudaNdarray", sys.getrefcount(y)
assert sys.getrefcount(y)==3
assert (numpy.asarray(z) == 0).all()
assert z.base is y
# Test that we can take a view from this cuda view on pycuda memory
zz = z.view()
assert sys.getrefcount(y) == 4
assert zz.base is y
del zz
assert sys.getrefcount(y) == 3
cuda_ones = cuda_ndarray.CudaNdarray(numpy.asarray([[[1]]],dtype='float32'))
z += cuda_ones
......
......@@ -50,13 +50,7 @@ class HostFromGpu(Op):
z[0] = numpy.asarray(x)
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz, tensor.TensorType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [gpu_from_host(gz)]
else:
return [gz]
return [gpu_from_host(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......@@ -85,13 +79,7 @@ class GpuFromHost(Op):
z[0] = type_support_filter(theano._asarray(x, dtype='float32'), tuple([0]*x.ndim), 0, z[0])
def grad(self, inputs, grads):
gz, = grads
if isinstance(gz,CudaNdarrayType):
# This would only happen if you call Lop, and provide a tensor
# that is not cuda
# This might require another look to be sure
return [host_from_gpu(gz)]
else:
return [gz]
return [host_from_gpu(gz)]
def R_op(self, inputs, eval_points):
ev, = eval_points
......
......@@ -2585,13 +2585,10 @@ int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * bas
// Get the original base object (base.base.base...)
PyObject * orig_base = base;
// base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
if (orig_base && CudaNdarray_Check(orig_base))
while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
{
while (((CudaNdarray*) orig_base)->base)
{
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
// base_base is itself a view
orig_base = ((CudaNdarray*) orig_base)->base;
}
//N.B. XDECREF and XINCREF are no-ops for NULL pointers
if (self->base != orig_base)
......
......@@ -590,7 +590,7 @@ def local_gpu_advanced_incsubtensor1(node):
gpu_from_host(y), *coords)]
# Should not execute for GpuAdvancedIncSubtensor1
if node.op.__class__ is tensor.AdvancedSubtensor1 and node.inputs[0].dtype=="float32":
if node.op.__class__ is tensor.AdvancedIncSubtensor1 and node.inputs[0].dtype=="float32":
x, y = node.inputs[0:2]
coords = node.inputs[2:]
go_gpu = False
......
......@@ -806,6 +806,22 @@ class T_subtensor(theano.tensor.tests.test_basic.T_subtensor):
def __init__(self, name):
return super(theano.tensor.tests.test_basic.T_subtensor, self).__init__(name)
def test_advinc_subtensor1():
""" Test the second case in the opt local_gpu_advanced_incsubtensor1 """
shared = cuda.shared_constructor
#shared = tensor.shared
xval = numpy.asarray([[1,2,3], [4,5,6], [7,8,9]],
dtype='float32')
yval = numpy.asarray([[10,10,10], [10,10,10]],
dtype='float32')
x = shared(xval, name = 'x')
y = T.fmatrices('y')
expr = T.advanced_inc_subtensor1(x,y,[0,2])
f=theano.function([y], expr, mode=mode_with_gpu)
assert sum([isinstance(node.op,cuda.GpuAdvancedIncSubtensor1) for node in f.maker.env.toposort() ])==1
assert numpy.allclose(f(yval),[[11.,12.,13.], [4.,5.,6.], [17.,18.,19.]])
def test_inc_subtensor():
shared = cuda.shared_constructor
#shared = tensor.shared
......@@ -832,7 +848,6 @@ def test_set_subtensor():
dtype='float32')
expr = T.set_subtensor(x[:,1:3], y[:,1:3])
f=theano.function([x,y], expr, mode=mode_with_gpu)
print f.maker.env.toposort()
assert sum([isinstance(node.op,cuda.GpuSubtensor) for node in f.maker.env.toposort() ])==1
assert sum([isinstance(node.op,cuda.GpuIncSubtensor) and node.op.set_instead_of_inc==True for node in f.maker.env.toposort() ])==1
print f(xval,yval)
......
......@@ -116,7 +116,7 @@ def test_run_nnet():
rval_gpu, tg = run_nnet(True, n_in=n_in, n_hid=n_hid)
#print "cpu:", rval_cpu
#print "gpu:", rval_gpu
abs_diff, rel_diff = theano.tensor.basic.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
abs_diff, rel_diff = theano.tensor.tensor_grad.numeric_grad.abs_rel_err(rval_gpu,rval_cpu)
max_abs_diff = abs_diff.max()
print "max abs diff=%e max rel diff=%e n_in=%d n_hid=%d"%(
max_abs_diff, rel_diff.max(), n_in, n_hid)
......
......@@ -41,4 +41,4 @@ __contact__ = "Razvan Pascanu <r.pascanu@gmail>"
import scan_opt
from scan import scan
from scan_views import map, reduce, foldl, foldr
from scan_utils import clone
from scan_utils import clone, until
差异被折叠。
......@@ -102,31 +102,18 @@ def reduce( fn
:param name: See ``scan``.
"""
# Makes sure the outputs_info is a list.
if not isinstance(outputs_info, (list,tuple)):
outs_info = [outputs_info]
else:
outs_info = list(outputs_info)
for i,out_info in enumerate(outs_info):
if out_info:
if not isinstance(out_info, dict):
# Specifies that it should return only the last step.
outs_info[i] = dict(
initial = out_info, return_steps = 1)
else:
# Specifies that it should return only the last step.
outs_info[i]['return_steps'] = 1
# NOTE : If the user asks for more then the last step,
# it means he does not understand ``reduce``. We could
# issue a warning in that case
return scan.scan( fn = fn
rval = scan.scan( fn = fn
, sequences = sequences
, outputs_info = outs_info
, outputs_info = outputs_info
, non_sequences = non_sequences
, go_backwards = go_backwards
, truncate_gradient = -1
, mode = mode
, name = name )
if isinstance(rval[0], (list,tuple)):
return [ x[-1] for x in rval[0]], rval[1]
else:
return rval[0][-1], rval[1]
# The ``foldl`` view of Scan Op.
......
......@@ -2838,7 +2838,7 @@ def extract_constant(x):
if x.owner and isinstance(x.owner.op, ScalarFromTensor):
x = x.owner.inputs[0]
else:
x = tensor.tensor_from_scalar(x)
x = tensor_from_scalar(x)
return x
......
......@@ -1245,32 +1245,59 @@ def local_useless_subtensor(node):
shape_of = node.env.shape_feature.shape_of
node_input_idx = 1
for pos, idx in enumerate(node.op.idx_list):
if not isinstance(idx, slice):
# If idx is not a slice, this means we remove this dimension
# from the output, so the subtensor is not useless
return False
if idx.start not in [0,None]:
# If the start of the slice is different from 0, or is a
# variable, then we assume the subtensor is not useless
return False
if idx.step not in [1, None]:
# If we are going backwards, or skipping elements, then this
# is not a useless subtensor
return False
length_pos_data = sys.maxint
length_pos_shape_i = None
try:
length_pos = shape_of[node.inputs[0]][pos]
if isinstance(length_pos, theano.tensor.basic.TensorConstant):
length_pos_data = length_pos.data
else:
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
try:
length_pos_data = get_constant_value(length_pos)
except TypeError:
pass
if isinstance(idx.stop, theano.scalar.Scalar):
if isinstance(node.inputs[node_input_idx].owner.op,
T.ScalarFromTensor):
length_pos_shape_i = node.inputs[node_input_idx].owner.inputs[0]
else:
length_pos_shape_i = node.inputs[node_input_idx]
assert length_pos_shape_i.type == idx.stop
# We already know that start and step are not variables
# and so they don't appear in the input of the node
node_input_idx += 1
# Catch exception from shape_of
except Exception, e:
length_pos = None
if ( isinstance(idx,slice) and
idx.start in [0,None] and
idx.step in [1,None] and
(idx.stop in [sys.maxint, None, length_pos_data] or
(isinstance(idx.stop, int) and idx.stop>=length_pos_data) or
(isinstance(idx.stop, theano.scalar.Scalar) and
length_pos==length_pos_shape_i)
)):
if isinstance(idx.stop, int):
if idx.stop < length_pos_data:
return False
elif isinstance(idx.stop, theano.scalar.Scalar):
if length_pos_shape_i is None:
return False
if length_pos is None:
return False
if length_pos_shape_i != length_pos:
return False
elif idx.stop is None:
pass
else:
return False
if isinstance(idx, slice):
node_input_idx += sum([isinstance(idx.start, theano.scalar.Scalar),
isinstance(idx.stop, theano.scalar.Scalar),
isinstance(idx.step, theano.scalar.Scalar)])
return [node.inputs[0]]
......
......@@ -136,6 +136,7 @@ def safe_make_node(op, *inputs):
return node[0].owner
else:
return node.owner
def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
bad_runtime = {}, grad = {}, mode = None, grad_rtol=None,
eps = 1e-10, skip = False):
......@@ -146,7 +147,7 @@ def makeTester(name, op, expected, checks = {}, good = {}, bad_build = {},
class Checker(unittest.TestCase):
op = _op
op = staticmethod(_op)
expected = staticmethod(_expected)
checks = _checks
good = _good
......@@ -999,6 +1000,52 @@ SecondSameRankTester = makeTester(
mode=get_default_mode().excluding('local_fill_to_alloc')
)
### Alloc
AllocTester = makeBroadcastTester(
name = 'AllocTester',
op = alloc,
expected = (lambda x, *shp: numpy.zeros(shp, dtype=x.dtype) + x),
good = dict(
correct02 = (rand(), numpy.int32(4), numpy.int32(7)),
correct12 = (rand(7), numpy.int32(4), numpy.int32(7)),
correct13 = (rand(7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
correct23 = (rand(4,7), numpy.int32(2), numpy.int32(4), numpy.int32(7)),
),
bad_runtime = dict(
bad_shape12 = (rand(7), numpy.int32(7), numpy.int32(5)),
too_big32 = (rand(6,2,4), numpy.int32(6), numpy.int32(2)),
too_big32b = (rand(6,2,4), numpy.int32(2), numpy.int32(4)),
),
)
# Since not all inputs of Alloc are differentiable, we need different testers
s1, s2, s3 = randint_ranged(1, 13, (3,))
# alloc a scalar into a vector
Alloc01GradTester = makeBroadcastTester(
name = 'Alloc01GradTester',
#op = (lambda self, x: alloc(x, s1)),
op = (lambda x: alloc(x, s1)),
expected = (lambda x: numpy.zeros((s1,), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(),),
x2 = (rand(),),
x3 = (rand(),),
),
)
# alloc a vector into a tensor3
Alloc13GradTester = makeBroadcastTester(
name = 'Alloc13GradTester',
#op = (lambda self, x: alloc(x, s1, s2, s3)),
op = (lambda x: alloc(x, s1, s2, s3)),
expected = (lambda x: numpy.zeros((s1, s2, s3), dtype=x.dtype) + x),
grad = dict(
x1 = (rand(s3),),
x2 = (rand(s3),),
x3 = (rand(s3),),
),
)
def test_eye():
def check(dtype, N, M_=None, k=0):
# Theano does not accept None as a tensor.
......
......@@ -13,30 +13,33 @@ ops without:
Prod
MulwithoutZeros
ProdWithoutZeros
CAReduce(for max,... done for MaxAndArgmax op)
list of ops that support R-op:
* with test
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor set_subtensor too
* Alloc
* Dot
* Elemwise
* Sum
* Softmax
* Shape
* Join
* without test
* Split
* ARange
* ScalarFromTensor
* Shape
* SpecifyShape
* MaxAndArgmax
* Subtensor
* IncSubtensor
* Rebroadcast
* Join
* Reshape
* Flatten
* AdvancedSubtensor1
* AdvancedIncSubtensor1
* AdvancedIncSubtensor
* Dot
* DimShuffle
* Elemwise
* Sum
* Softmax
* Scan
......@@ -183,11 +186,17 @@ class test_RopLop(unittest.TestCase):
self.in_shape)
def test_max_argmax(self):
def test_max(self):
## If we call max directly, we will return an CAReduce object
## and he don't have R_op implemented!
#self.check_mat_rop_lop(TT.max(self.mx, axis=[0,1])[0],
# ())
self.check_mat_rop_lop(TT.max(self.mx, axis=0),
(self.mat_in_shape[1],))
self.check_mat_rop_lop(TT.max(self.mx, axis=1),
(self.mat_in_shape[0],))
def test_max_argmax(self):
def test_argmax(self):
self.check_nondiff_rop(TT.argmax(self.mx,axis=1))
def test_subtensor(self):
......@@ -201,7 +210,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_incsubtensor1(self):
def test_incsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
......@@ -217,7 +226,7 @@ class test_RopLop(unittest.TestCase):
self.check_rop_lop(out, self.in_shape)
def test_setsubtensor1(self):
def test_setsubtensor2(self):
tv = numpy.asarray( self.rng.uniform(size=(10,)),
theano.config.floatX)
t = theano.shared(tv)
......
......@@ -4,15 +4,31 @@ This is a REALLY PARTIAL TEST.
I did them to help debug stuff.
"""
import logging
import StringIO
import theano
import theano.tensor as tensor
def test_pydotprint_cond_highlight():
assert len(theano.theano_logger.handlers) == 1
x = tensor.dvector()
f = theano.function([x], x*2)
f([1,2,3,4])
theano.printing.pydotprint(f, cond_highlight = True)
s = StringIO.StringIO()
new_handler = logging.StreamHandler(s)
new_handler.setLevel(logging.DEBUG)
orig_handler = theano.theano_logger.handlers[0]
theano.theano_logger.removeHandler(orig_handler)
theano.theano_logger.addHandler(new_handler)
try:
theano.printing.pydotprint(f, cond_highlight = True)
finally:
theano.theano_logger.addHandler(orig_handler)
theano.theano_logger.removeHandler(new_handler)
assert s.getvalue() == 'pydotprint: cond_highlight is set but there is no IfElse node in the graph\n'
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论