提交 2fb51c86 authored 作者: Frederic Bastien's avatar Frederic Bastien

merge

......@@ -150,4 +150,11 @@ Config Attributes
A directory with bin/, lib/, include/ folders containing cuda utilities.
.. attribute:: gcc.cxxflags
Default: ""
Extra parameters to pass to gcc when compiling. Extra include paths,
library paths, configuration options, etc.
==================
Advanced Indexing
==================
Continue the Advanced Indexing project that is on either github or bitbucket.
Conditional Evaluation
=======================
This has been discussed informally many times. Modify the linker to look for a
lazy_perform function in each op, that can either return the fact that
computations are done, or it can return a list of inputs that must first be
evaluated before it can proceed.
========================
DP Instruction Selection
========================
Read Ch 9 of Modern Compiler Implementation about instruction selection.
We should probably be doing graph optimization totally differently:
Optimizations *only add* new ways of implementing something, they do not replace
the old way. Every graph node (apply) has a cost, and Dynamic Programming (DP)
is used to select the minimum cost graph.
The advantage of this approach is that optimizations do not have to run in such
a careful order, and graph selection would be much faster.
Think about how aliasing and destructive operations (the destroy-handler) would
fit in this approach.
=====================
Intermediate Language
=====================
It would be nice to be able to use Theano from other languages.
This requires two things: a way to communicate the expression to the theano
compiler, and a way to pass data to and from the compiled function.
One way to do this would be define a textual representation of theano graphs.
A Scheme-like language seems appropriate. Perhaps just scheme would be
appropriate.
How to pass shared variables?
=================
MongoDB DLL Cache
=================
In network environments (like at DIRO on NFS3), a distributed DB like mongo or couch is faster and more
robust to concurrency than the $HOME/.theano. Also, a single cache could be
shared by multiple users. This would result in less compilation time, for
everyone, and less stale-cache problems.
......@@ -4,6 +4,7 @@ Automatic updates
.. note:
Proposed 2010 01 13
Done 2010 04 ??
The Module version of RandomStreams could arrange for the automatic update of
......
=======
OpenCL
=======
Migrate the GPU code-generators to the PyCUDA style, and eventually to OpenCL.
This means mainly to use a different kind of code-generation strategy. The
kernel itself is compiled, but the calling code remains in python or cython. We
would no longer generate entire C files this way, and no longer use the CLinker
for GPU code.
Proactive Merging
=================
Merge is done now as an optimization.
But if Merging was done at graph construction time, things like #476 would work.
Additionally, memo-izing at graph construction time would make it possible to
define recursive formula with recursive python functions (e.g. Fibonacci).
Currently the merge optimization would make the Fibonacci series linear, but the
size of the program used to express the program would be exponential.
========================
Add tensor attributes
========================
Size, shape, psd, symmetric, triangular, contiguous.
Add these attributes to the TensorType with the option always that they be
'unknown'.
Add attributes that are useful for optimizations, or useful for code generation.
......@@ -104,6 +104,47 @@ class BadCLinkerOutput(DebugModeError):
print >> sio, " val_py :", self.val_py
print >> sio, " val_c :", self.val_c
print >> sio, " op :", self.offending_op()
try:
ssio = StringIO()
print >> ssio, " PyValue shape, dtype, strides, min, max:",
print >> ssio, self.val_py.shape,
print >> ssio, self.val_py.dtype,
print >> ssio, self.val_py.strides
print >> ssio, self.val_py.min(),
print >> ssio, self.val_py.max(),
# only if all succeeds to we add anything to sio
print >> sio, ssio.getvalue()
except:
pass
try:
ssio = StringIO()
print >> ssio, " CValue shape, dtype, strides, min, max:",
print >> ssio, self.val_c.shape,
print >> ssio, self.val_c.dtype,
print >> ssio, self.val_c.strides,
print >> ssio, self.val_c.min(),
print >> ssio, self.val_c.max(),
# only if all succeeds to we add anything to sio
print >> sio, ssio.getvalue()
except:
pass
try:
ov=numpy.asarray(self.val_c)
nv=numpy.asarray(self.val_py)
ssio = StringIO()
print >> ssio, " Max Abs Diff: ", numpy.max(numpy.absolute(nv-ov))
print >> ssio, " Mean Abs Diff: ", numpy.mean(numpy.absolute(nv-ov))
print >> ssio, " Median Abs Diff: ", numpy.median(numpy.absolute(nv-ov))
print >> ssio, " Std Abs Diff: ", numpy.std(numpy.absolute(nv-ov))
reldiff = numpy.absolute(nv-ov) / (numpy.absolute(nv)+numpy.absolute(ov))
print >> ssio, " Max Rel Diff: ", numpy.max(reldiff)
print >> ssio, " Mean Rel Diff: ", numpy.mean(reldiff)
print >> ssio, " Median Rel Diff: ", numpy.median(reldiff)
print >> ssio, " Std Rel Diff: ", numpy.std(reldiff)
# only if all succeeds to we add anything to sio
print >> sio, ssio.getvalue()
except:
pass
return sio.getvalue()
class BadOptimization(DebugModeError):
......
......@@ -858,6 +858,7 @@ class CLinker(link.Linker):
if config.gcc.cxxflags:
sig.append(config.gcc.cxxflags)
error_on_play = [False]
def in_sig(i, topological_pos, i_idx):
# assert that every input to every node is one of'
# - an env input
......@@ -868,8 +869,15 @@ class CLinker(link.Linker):
# yield a 'position' that reflects its role in code_gen()
if isinstance(i, graph.Constant): #orphans
if id(i) not in constant_ids:
constant_ids[id(i)] = (i.signature(), topological_pos, i_idx)
isig = constant_ids[id(i)]
isig = (i.signature(), topological_pos, i_idx)
try:
hash(isig)
except: #generic constants don't have a hashable signature
error_on_play[0] = True
return None
constant_ids[id(i)] = isig
else:
isig = constant_ids[id(i)]
#print 'SIGNATURE', i.signature()
#return i.signature()
elif i in env_inputs_dict: #inputs
......@@ -903,6 +911,12 @@ class CLinker(link.Linker):
for ipos,i in enumerate(node.inputs)),
tuple(o in no_recycling for o in node.outputs)))
if error_on_play[0]:
# if one of the signatures is not hashable
# then bypass the cache mechanism and
# compile fresh every time
return None
op_pos[node] = node_pos
env_computed_set.update(node.outputs)
......
......@@ -7,6 +7,7 @@ if sys.version_info[:2] >= (2,5):
import toolbox
import graph
from theano.gof import deque
from env import InconsistencyError
......@@ -44,6 +45,92 @@ class DestroyHandler(object):
return self.map[env].orderings(env)
def _dfs_toposort(i, r_out, orderings):
    """
    Topologically check the graph reaching ``r_out`` from inputs ``i``.

    i - list of inputs
    o - list of outputs
    orderings - dict of additions to the normal inputs and outputs

    Returns nothing. Raises exception for graph with cycles
    """
    # NOTE: this is a hard-coded reimplementation of functions from graph.py
    # reason: go faster, prepare for port to C.
    assert isinstance(r_out, (tuple, list, deque))

    # TODO: For more speed - use a defaultdict for the orderings
    iset = set(i)

    if 0:
        # Disabled reference implementation of the dependency-expansion
        # step below; kept for documentation of the intended semantics.
        def expand(obj):
            rval = []
            if obj not in iset:
                if isinstance(obj, graph.Variable):
                    if obj.owner:
                        rval = [obj.owner]
                if isinstance(obj, graph.Apply):
                    rval = list(obj.inputs)
                rval.extend(orderings.get(obj, []))
            else:
                assert not orderings.get(obj, [])
            return rval

    expand_cache = {}
    # Phase 1: depth-first search from the outputs, recording for each
    # reachable node the list of nodes it depends on (expand_cache) and the
    # inverse relation (expand_inv: dependency -> dependents).
    # reachable, clients = stack_search( deque(r_out), deps, 'dfs', True)
    start = deque(r_out)
    rval_set = set()
    rval_set.add(id(None))  # sentinel so id(None) is never re-visited
    rval_list = list()
    expand_inv = {}
    sources = deque()  # nodes found to have no dependencies at all
    while start:
        l = start.pop()  # popping from the right makes the search dfs
        if id(l) not in rval_set:
            rval_list.append(l)
            rval_set.add(id(l))
            if l in iset:
                # graph inputs have no dependencies
                assert not orderings.get(l, [])
                expand_l = []
            else:
                try:
                    # a Variable depends on the Apply node that produced it
                    if l.owner:
                        expand_l = [l.owner]
                    else:
                        expand_l = []
                except AttributeError:
                    # no .owner attribute: presumably an Apply node, which
                    # depends on its inputs -- TODO(review): confirm no other
                    # object kind reaches this branch
                    expand_l = list(l.inputs)
                expand_l.extend(orderings.get(l, []))
            if expand_l:
                for r in expand_l:
                    expand_inv.setdefault(r, []).append(l)
                start.extend(expand_l)
            else:
                sources.append(l)
            expand_cache[l] = expand_l
    # the -1 accounts for the id(None) sentinel added above
    assert len(rval_list) == len(rval_set) - 1

    # Phase 2: repeatedly consume nodes whose dependencies have all been
    # satisfied (Kahn-style ordering), trimming satisfied dependencies out
    # of each dependent's cached list.
    rset = set()
    rlist = []
    while sources:
        node = sources.popleft()
        if node not in rset:
            rlist.append(node)
            rset.add(node)
            for client in expand_inv.get(node, []):
                expand_cache[client] = [a for a in expand_cache[client] if a is not node]
                if not expand_cache[client]:
                    sources.append(client)

    # any reachable node never freed of its dependencies implies a cycle
    if len(rlist) != len(rval_list):
        raise ValueError('graph contains cycles')
    #return [o for o in rlist if isinstance(o, graph.Apply)]
def getroot(r, view_i):
"""
For views: Return non-view variable which is ultimatly viewed by r.
......@@ -303,7 +390,8 @@ class DestroyHandlerHelper2(toolbox.Bookkeeper):
raise
#print 'orderings:', ords
try:
graph.io_toposort(env.inputs, env.outputs, ords)
### graph.io_toposort(env.inputs, env.outputs, ords)
_dfs_toposort(env.inputs, env.outputs, ords)
except ValueError, e:
#print 'not passing.', ords
if 'cycles' in str(e):
......
......@@ -9,9 +9,10 @@ compatible with `gof`'s :doc:`graph` routines.
__docformat__ = "restructuredtext en"
import utils
import traceback
from theano import config
class CLinkerObject(object):
"""Standard elements of an Op or Type used with the CLinker
"""
......@@ -320,9 +321,7 @@ class PureOp(object):
"""
node = self.make_node(*inputs, **kwargs)
limit = config.traceback.limit
if limit == -1: limit = None
node.tag.trace = traceback.extract_stack(limit=limit)[:-1]
self.add_tag_trace(node)
if self.default_output is not None:
return node.outputs[self.default_output]
else:
......@@ -331,6 +330,9 @@ class PureOp(object):
else:
return node.outputs
# Convenience so that subclass implementers don't have to import utils
# just to self.add_tag_trace
add_tag_trace = staticmethod(utils.add_tag_trace)
#########################
# Python implementation #
......
......@@ -127,6 +127,8 @@ class SeqOptimizer(Optimizer, list):
for optimizer in self:
try:
optimizer.optimize(env)
except AssertionError: # do not catch Assertion failures
raise
except Exception, e:
if self.failure_callback:
self.failure_callback(e, self, optimizer)
......
......@@ -5,8 +5,7 @@ __docformat__ = "restructuredtext en"
import copy
import utils
from utils import MethodNotDefined, object2
from graph import Variable
import traceback
import graph
from theano import config
########
......@@ -202,6 +201,9 @@ class PureType(object):
"""
Variable = graph.Variable #the type that will be created by call to make_variable.
Constant = graph.Constant #the type that will be created by call to make_constant
def filter(self, data, strict = False):
"""Required: Return data or an appropriately wrapped/converted data.
......@@ -233,8 +235,11 @@ class PureType(object):
A pretty string for printing and debugging.
"""
r = Variable(self, name = name)
return r
return self.Variable(self, name = name)
def make_constant(self, value, name=None):
return self.Constant(type=self, data=value, name=name)
def __call__(self, name = None):
"""Return a new `Variable` instance of Type `self`.
......@@ -244,11 +249,7 @@ class PureType(object):
A pretty string for printing and debugging.
"""
r = self.make_variable(name)
limit = config.traceback.limit
if limit == -1: limit = None
r.tag.trace = traceback.extract_stack(limit=limit)[:-1]
return r
return utils.add_tag_trace(self.make_variable(name))
def values_eq(self, a, b):
"""
......@@ -319,9 +320,11 @@ class Type(object2, PureType, CLinkerType):
"""
## DELETEME ##
class SingletonType(Type):
"""WRITEME"""
"""Convenient Base class for a Type subclass with no attributes
It saves having to implement __eq__ and __hash__
"""
__instance = None
def __new__(cls):
if cls.__instance is None:
......@@ -378,14 +381,5 @@ class Generic(SingletonType):
Py_INCREF(py_%(name)s);
""" % locals()
generic = Generic()
## DELETEME ##
class PropertiedType(Type):
"""WRITEME"""
def __eq__(self, other):
return type(self) == type(other) and self.__dict__ == other.__dict__
......@@ -2,7 +2,18 @@
# import op
# import variable
import re, os
from theano import config
import re, os, traceback
def add_tag_trace(thing):
    """Record the current call stack on ``thing.tag.trace``.

    Works on a node or a variable.  ``thing`` is modified in place and
    returned, so calls can be chained.
    """
    depth = config.traceback.limit
    if depth == -1:
        depth = None
    # Drop the innermost frame (this function itself) from the trace.
    stack = traceback.extract_stack(limit=depth)
    thing.tag.trace = stack[:-1]
    return thing
def hashgen():
hashgen.next += 1
......
......@@ -53,6 +53,10 @@ def debugprint(obj, depth=-1, print_type=False, file=None):
results_to_print.extend(obj.outputs)
elif isinstance(obj, Function):
results_to_print.extend(obj.maker.env.outputs)
elif isinstance(obj, (list, tuple)):
results_to_print.extend(obj)
else:
raise TypeError("debugprint cannot print an object of this type", obj)
for r in results_to_print:
debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file)
if file is _file:
......
......@@ -149,18 +149,25 @@ class GpuGemm(Op):
"""
implement the gemm on the gpu.
..note: This probably don't work correctly for no_inplace gemm.
Need to check al least refcount.
"""
destroy_map = {0:[0]}
def __init__(self, inplace):
self.inplace = inplace
if self.inplace:
self.destroy_map = {0:[0]}
def __str__(self):
return 'GpuGemm'
if self.inplace:
return 'GpuGemm{inplace}'
else:
return 'GpuGemm{no_inplace}'
def __eq__(self, other):
return type(self) == type(other)
return type(self) == type(other) and self.inplace == other.inplace
def __hash__(self):
return hash(type(self))
return hash(type(self)) ^ hash(self.inplace)
def __setstate__(self, dct):
self.inplace = dct.get('inplace', True)
def make_node(self, z, a, x, y, b):
# the more complicated error checking performed by tensor.gemm is assumed to already
......@@ -168,13 +175,16 @@ class GpuGemm(Op):
return Apply(self, [z, a, x, y, b], [z.type()])
def c_code_cache_version(self):
return ()
return (2,)
def c_code(self, node, name, inputs, outputs, sub):
z_in, a, x, y, b = inputs
z_out, = outputs
fail = sub['fail']
return """
sio = StringIO.StringIO()
print >> sio, """
#define REAL float
float %(name)s_a = (%(a)s->descr->type_num == PyArray_FLOAT)
......@@ -186,15 +196,48 @@ class GpuGemm(Op):
: (REAL)(((double*)%(b)s->data)[0]);
#undef REAL
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_in)s))
"""
if self.inplace:
print >> sio, """
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
"""
else:
print >> sio, """
if (!%(z_out)s
|| (%(z_out)s->nd != 2)
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[0] != CudaNdarray_HOST_DIMS(%(z_in)s)[0])
|| (CudaNdarray_HOST_DIMS(%(z_out)s)[1] != CudaNdarray_HOST_DIMS(%(z_in)s)[1])
)
{
Py_XDECREF(%(z_out)s);
%(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
if (!%(z_out)s)
{
%(fail)s;
}
}
else
{
if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
{
%(fail)s;
}
}
"""
print >> sio, """
if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
{
%(fail)s;
}
Py_XDECREF(%(z_out)s);
%(z_out)s = %(z_in)s;
Py_INCREF(%(z_out)s);
""" % locals()
gpu_gemm = GpuGemm()
"""
return sio.getvalue() % locals()
gpu_gemm_no_inplace = GpuGemm(inplace=False)
gpu_gemm_inplace = GpuGemm(inplace=True)
##
# Not really a BLAS operation, but whatever.
......
......@@ -533,19 +533,19 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
if(PyTuple_Check(shape)){
for (int i = 0; i < rval_nd; ++i)
{
rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
if (PyErr_Occurred()) //error in AsLong
{
free(rval_dims);
{
rval_dims[i] = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
if (PyErr_Occurred()) //error in AsLong
{
free(rval_dims);
return NULL;
}
if(rval_dims[i]<=0){
PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >0)",rval_dims[i]);
free(rval_dims);
return NULL;
}
rval_size = rval_size * rval_dims[i];
}
if(rval_dims[i]<=0){
PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %i (must be >0)",rval_dims[i]);
free(rval_dims);
return NULL;
}
rval_size = rval_size * rval_dims[i];
}
}else{
rval_size = PyInt_AsLong(shape);
......@@ -558,6 +558,10 @@ PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
free(rval_dims);
return NULL;
}
if (rval_size==0)
{
return CudaNdarray_NewDims(rval_nd, rval_dims);
}
if(CudaNdarray_is_c_contiguous(self))
{
......@@ -1019,6 +1023,44 @@ CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
Py_INCREF(py_self);
return py_self;
}
case 5:
{
dim3 n_blocks(
std::min(CudaNdarray_HOST_DIMS(self)[1], NUM_VECTOR_OP_BLOCKS),
CudaNdarray_HOST_DIMS(self)[2]
);
while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS) n_blocks.y /= 2;
dim3 n_threads(
std::min(CudaNdarray_HOST_DIMS(self)[3], NUM_VECTOR_OP_THREADS_PER_BLOCK)
);
for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
{
k_iAdd_4<<<n_blocks, n_threads>>>(
CudaNdarray_HOST_DIMS(self)[1],
CudaNdarray_HOST_DIMS(self)[2],
CudaNdarray_HOST_DIMS(self)[3],
CudaNdarray_HOST_DIMS(self)[4],
CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
CudaNdarray_HOST_STRIDES(self)[1],
CudaNdarray_HOST_STRIDES(self)[2],
CudaNdarray_HOST_STRIDES(self)[3],
CudaNdarray_HOST_STRIDES(self)[4],
CudaNdarray_DEV_DATA(other) + i * CudaNdarray_HOST_STRIDES(other)[0],
CudaNdarray_HOST_STRIDES(other)[1],
CudaNdarray_HOST_STRIDES(other)[2],
CudaNdarray_HOST_STRIDES(other)[3],
CudaNdarray_HOST_STRIDES(other)[4]);
CNDA_THREAD_SYNC;
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "k_iAdd", cudaGetErrorString(err));
return NULL;
}
}
Py_INCREF(py_self);
return py_self;
}
}
PyErr_Format(PyExc_NotImplementedError, "inplace_add w nd=%i\n", self->nd);
......
......@@ -2,12 +2,13 @@ import sys
import theano
import numpy
from theano import scalar as scal
from theano import tensor, compile
from theano import tensor, compile, gof
from theano.gof import local_optimizer, EquilibriumDB, SequenceDB, Optimizer, toolbox, DestroyHandler
from theano.sandbox.cuda.basic_ops import *
from theano.sandbox.cuda.type import CudaNdarrayType
from theano.sandbox.cuda.blas import gpu_dot22, gpu_dot22scalar, gpu_gemm, GpuConv
from theano.sandbox.cuda.blas import (gpu_dot22, gpu_dot22scalar, gpu_gemm_inplace,
gpu_gemm_no_inplace, GpuConv)
from theano.sandbox.cuda.blas import GpuDownsampleFactorMax, GpuDownsampleFactorMaxGrad
from theano.sandbox.cuda.nnet import (
GpuCrossentropySoftmaxArgmax1HotWithBias,
......@@ -191,18 +192,21 @@ def local_gpu_gemm(node):
gemm(host_from_gpu) -> host_from_gpu(gpu_gemm)
"""
gemms = {tensor.blas.gemm_inplace: gpu_gemm_inplace,
tensor.blas.gemm_no_inplace: gpu_gemm_no_inplace}
if node.op == gpu_from_host:
host_input = node.inputs[0]
if host_input.owner and host_input.owner.op == tensor.blas.gemm_inplace:
if host_input.owner and host_input.owner.op in gemms:
op = host_input.owner.op
z, a, x, y, b = host_input.owner.inputs
return [gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
if node.op == tensor.blas.gemm_inplace:
return [gemms[op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b)]
if node.op in gemms:
z, a, x, y, b = node.inputs
x_on_gpu = (x.owner and x.owner.op == host_from_gpu)
y_on_gpu = (y.owner and y.owner.op == host_from_gpu)
z_on_gpu = (z.owner and z.owner.op == host_from_gpu)
if x_on_gpu or y_on_gpu or z_on_gpu:
return [host_from_gpu(gpu_gemm(gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return [host_from_gpu(gemms[node.op](gpu_from_host(z), a, gpu_from_host(x), gpu_from_host(y), b))]
return False
@register_opt()
......@@ -224,6 +228,33 @@ def local_gpu_sum(node):
if hasattr(gsum, 'c_code_reduce_%s'%pattern):
return [host_from_gpu(gsum(gpu_from_host(x)))]
else:
# Try to make a simpler pattern based on reshaping
# The principle is that if two adjacent dimensions have the same value in
# the reduce_mask, then we can reshape to make them a single dimension, do
# the sum, and then reshape to get them back.
shape_of = node.env.shape_feature.shape_of
x_shape = shape_of[x]
new_in_shp = [x_shape[0]]
new_mask = [reduce_mask[0]]
for i in range(1, x.type.ndim):
if reduce_mask[i] == reduce_mask[i-1]:
new_in_shp[-1] *= x_shape[i]
else:
new_mask.append(reduce_mask[i])
new_in_shp.append(x_shape[i])
pattern=(''.join(str(i) for i in new_mask))
new_gsum = GpuSum(new_mask)
if hasattr(new_gsum, 'c_code_reduce_%s'%pattern):
reshaped_x = x.reshape(tensor.stack(*new_in_shp))
sum_reshaped_x = host_from_gpu(new_gsum(gpu_from_host(reshaped_x)))
unreshaped_sum = sum_reshaped_x.reshape(tensor.stack(*shape_of[node.outputs[0]]))
return [unreshaped_sum]
raise Exception("GpuSum don't have implemented the pattern",pattern)
return False
......
......@@ -34,11 +34,13 @@ def test_sum():
test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111, 0011, 0111, 1011, 1111
TODO: test with broadcast
"""
for shape, pattern in [((0,),[0]),((5,),[0]),
for shape, pattern in [((100,3,1300),[1]),
((0,),[0]),((5,),[0]),
((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger then 32 for some opt test.
((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
((0,0,0,0),[0,1,2,3]),
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3])]:
((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
((5,4,3,10,11),[1,2])]:
a = tensor.TensorType('float32',(False,)*len(shape))()
b = T.Sum(pattern)(a)
val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
......@@ -50,9 +52,9 @@ def test_sum():
assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
if val.size==0:
assert f2(val)==f(val)
assert f2(val)==f(val), ('shape', shape, 'pattern', pattern)
else:
assert numpy.allclose(f2(val),f(val))
assert numpy.allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern)
#test with dimshuffle
......
......@@ -191,25 +191,31 @@ class Scalar(Type):
typedef theano_complex%(nbits)s complex_type;
typedef npy_float%(half_nbits)s scalar_type;
complex_type operator +(complex_type y) {
complex_type operator +(const complex_type &y) const {
complex_type ret;
ret.real = this->real + y.real;
ret.imag = this->imag + y.imag;
return ret;
}
complex_type operator -(complex_type y) {
complex_type operator -() const {
complex_type ret;
ret.real = -this->real;
ret.imag = -this->imag;
return ret;
}
complex_type operator -(const complex_type &y) const {
complex_type ret;
ret.real = this->real - y.real;
ret.imag = this->imag - y.imag;
return ret;
}
complex_type operator *(complex_type y) {
complex_type operator *(const complex_type &y) const {
complex_type ret;
ret.real = this->real * y.real - this->imag * y.imag;
ret.imag = this->real * y.imag + this->imag * y.real;
return ret;
}
complex_type operator /(complex_type y) {
complex_type operator /(const complex_type &y) const {
complex_type ret;
scalar_type y_norm_square = y.real * y.real + y.imag * y.imag;
ret.real = (this->real * y.real + this->imag * y.imag) / y_norm_square;
......@@ -296,6 +302,7 @@ class Scalar(Type):
return ""
def c_code_cache_version(self):
return (8,) # put const around operators and added unary '-' operator
# no need to put lib.amdlibm here as c_compile_args() are put in the key.
return (7,) # make complex c code optional
return (6,) # added implemeentations of operators that work with scalar arguments
......@@ -459,7 +466,39 @@ def float_out_nocomplex(*types):
if type in complex_types:
raise TypeError('complex argument not supported')
return float64,
class unary_out_lookup(gof.utils.object2):
    """
    get a output_types_preference object by passing a dictionary:

    unary_out_lookup({int8:int32, float32:complex128})

    The result is an op that maps in8 to int32 and float32 to complex128 and
    other input types lead to a TypeError.
    """
    def __init__(self, type_table):
        # type_table: dict mapping input type (or tuple of types) to the
        # corresponding output type (or list of types).
        self.tbl = type_table

    def __call__(self, *types):
        # A single argument is looked up directly; multiple arguments are
        # looked up as a tuple key.
        if len(types) == 1:
            types = types[0]
        try:
            rval = self.tbl[types]
        except (KeyError, TypeError):
            # KeyError: the type is not in the table; TypeError: the key is
            # unhashable.  Either way the input types are unsupported.
            # (Previously a bare ``except:`` which also swallowed
            # KeyboardInterrupt and friends.)
            raise TypeError(types)
        if isinstance(types, (list, tuple)):
            # table value for a tuple key is already a list of output types
            return rval
        else:
            return [rval]

    def __eq__(self, other):
        return type(self) == type(other) and self.tbl == other.tbl

    def __hash__(self):
        return hash(type(self))  # ignore hash of table
def real_out(type):
    """Output-type rule mapping a complex input type to its real-valued
    counterpart (complex64 -> float32, complex128 -> float64).

    Non-complex types pass through unchanged.  A 1-tuple is returned, as
    expected of an output_types_preference function.
    """
    if type == complex64:
        rval = float32
    elif type == complex128:
        rval = float64
    else:
        rval = type
    return (rval,)
class ScalarOp(Op):
......@@ -810,7 +849,14 @@ class Mul(ScalarOp):
retval = []
for input in inputs:
if input.type in grad_types:
retval += [cast(mul(*([gz] + utils.difference(inputs, [input]))), input.type.dtype)]
if input.type in complex_types:
# does casting from real to complex work?
dz_dinput = cast(mul(*(utils.difference(inputs, [input]))), input.type.dtype)
x = real(dz_dinput)
y = imag(dz_dinput)
retval += [complex(x*real(gz)+y*imag(gz), x*imag(gz)-y*real(gz))]
else:
retval += [cast(mul(*([gz] + utils.difference(inputs, [input]))), input.type.dtype)]
else:
retval += [None]
......@@ -1088,11 +1134,9 @@ class Abs(UnaryScalarOp):
return numpy.abs(x)
def grad(self, (x, ), (gz, )):
if x.type in grad_types:
return gz * sgn(x),
return gz * x / abs(x), # formula works for complex and real
else:
return None,
#backport
#return gz * sgn(x) if x.type in grad_types else None,
return None,
def c_code(self, node, name, (x, ), (z, ), sub):
type = node.inputs[0].type
if type in int_types:
......@@ -1478,6 +1522,91 @@ class Tanh(UnaryScalarOp):
return "%(z)s = tanh(%(x)s);" % locals()
tanh = Tanh(upgrade_to_float, name = 'tanh')
class Real(UnaryScalarOp):
    """Extract the real coordinate of a complex number. """
    def impl(self, x):
        return numpy.real(x)

    def grad(self, (x, ), (gz, )):
        # The (real-valued) output gradient flows entirely into the real
        # coordinate of the complex input.
        return [complex(gz, 0)]
real = Real(real_out, name='real')
class Imag(UnaryScalarOp):
    """Extract the imaginary coordinate of a complex number."""
    def impl(self, x):
        return numpy.imag(x)

    def grad(self, (x, ), (gz, )):
        # The (real-valued) output gradient flows entirely into the
        # imaginary coordinate of the complex input.
        return [complex(0, gz)]
imag = Imag(real_out, name='imag')
class Angle(UnaryScalarOp):
    """Compute the argument (phase angle) of a complex number."""
    def impl(self, x):
        return numpy.angle(x)

    def grad(self, (c, ), (gtheta, )):
        # Derivation sketch for theta = angle(c), written in terms of the
        # arcsin formulation below:
        # y = x.imag
        # r = sqrt(y**2 + x.real**2)
        # g = y/r
        # if x == 0 and y == 0:
        #     theta = 0
        # elif x >= 0:
        #     theta = numpy.arcsin(g)
        # else:
        #     theta = -numpy.arcsin(g)+numpy.pi
        x = real(c)
        y = imag(c)
        r = abs(c)
        # Chain rule through theta = arcsin(y/r).
        # NOTE(review): undefined at r == 0 and at y == +-r; confirm callers
        # never differentiate at those points.
        gr = -gtheta * y / (r**2 * sqrt(1 - (y/r)**2))
        gx = gr * x/r
        gy = gr * y/r
        return [complex(gx, gy)]
angle = Angle(specific_out(float64), name='angle')
class Complex(BinaryScalarOp):
    """Build a complex number from real part ``x`` and imaginary part ``y``.

    Both inputs must be real-valued.
    """
    @staticmethod
    def output_types_preference(x,y):
        # Complex inputs are rejected: the coordinates themselves must be
        # real scalars.
        if x in complex_types:
            raise TypeError(x)
        if y in complex_types:
            raise TypeError(y)
        # Wide real upcasts produce complex128; everything else complex64.
        up = Scalar.upcast(x, y)
        if up in ('float64', 'int64', 'uint64', 'int32', 'uint32'):
            return [complex128]
        else:
            return [complex64]

    def impl(self, x, y):
        return numpy.complex(x, y)

    def grad(self, (x,y), (z,)):
        # Gradient w.r.t. the real coordinate is the real part of the output
        # gradient, and likewise for the imaginary coordinate.
        return [real(z), imag(z)]
complex = Complex(name='complex')
class Conj(UnaryScalarOp):
    """Complex conjugate (identity on real inputs)."""
    def impl(self, x):
        return numpy.conj(x)

    def grad(self, (x, ), (gz, )):
        # Conjugation is its own adjoint: the gradient is conjugated too.
        return [conj(gz)]
conj = Conj(same_out, name='conj')
class ComplexFromPolar(BinaryScalarOp):
    """Build a complex number from polar coordinates ``(r, theta)``."""
    @staticmethod
    def output_types_preference(x,y):
        # Same real-only input / complex output policy as ``Complex``.
        return Complex.output_types_preference(x,y)

    def impl(self, r, theta):
        if r < 0:
            raise ValueError('polar radius must be non-negative', r)
        x = r*numpy.cos(theta)
        y = r*numpy.sin(theta)
        # Preserve single precision when the intermediate is float32.
        # NOTE(review): assumes r and theta are numpy scalars (x.dtype
        # attribute access) -- confirm plain Python floats never reach here.
        if x.dtype == 'float32':
            return numpy.complex64(numpy.complex(x,y))
        else:
            return numpy.complex128(numpy.complex(x,y))

    def grad(self, (r,theta), (gz,)):
        # Chain rule through x = r*cos(theta), y = r*sin(theta).
        gr = cos(theta) * real(gz) + sin(theta) * imag(gz)
        gtheta = -real(gz) * r * sin(theta) + imag(gz) * r * cos(theta)
        return [gr, gtheta]
complex_from_polar = ComplexFromPolar(name='complex_from_polar')
class Composite(ScalarOp):
......@@ -1642,3 +1771,4 @@ class Composite(ScalarOp):
#otherwise self.perform won't work.
self.__init__(self.inputs, self.outputs)
差异被折叠。
......@@ -618,7 +618,7 @@ class Elemwise(Op):
errormsg = 'Failed calling ufunc for op', self.scalar_op,\
'for params of shape', [arg.shape for arg in ufunc_args]
e.args = e.args + errormsg
raise e
raise
if ufunc.nout == 1: variables = [variables]
for variable, storage in zip(variables, output_storage):
if storage[0].shape:
......
......@@ -180,11 +180,12 @@ def local_dimshuffle_lift(node):
new_order = [x == 'x' and 'x' or inode.op.new_order[x] for x in op.new_order]
inplace = op.inplace and inode.op.inplace
iinput = inode.inputs[0]
if new_order == range(len(new_order)):
if new_order == range(len(new_order)) and (len(new_order) == iinput.type.ndim):
return [iinput]
else:
return DimShuffle(iinput.type.broadcastable, new_order, inplace).make_node(iinput).outputs
register_canonicalize(local_dimshuffle_lift)
register_specialize(local_dimshuffle_lift)
......@@ -519,8 +520,8 @@ def local_fill_to_alloc(node):
# TODO: cut out un-necessary dimshuffles of v
rval = [T.alloc(T.cast(v, node.outputs[0].dtype), *shape_of[node.outputs[0]])]
if rval[0].type != node.outputs[0].type:
print >> sys.stderr, theano.printing.debugprint(node.outputs[0], file='str')
#if rval[0].type != node.outputs[0].type:
#print >> sys.stderr, theano.printing.debugprint(node.outputs[0], file='str')
assert rval[0].type == node.outputs[0].type, ('rval', rval[0].type,
'orig', node.outputs[0].type,
......@@ -1048,9 +1049,41 @@ def local_reshape_chain(node):
if not opt.check_chain(node, T.Reshape, T.Reshape):
return False
# TODO: this can permit a failing program to run by eliminating the the lower
# reshape
return [node.op(node.inputs[0].owner.inputs[0], node.inputs[1])]
register_canonicalize(local_reshape_chain)
if 0:
# TODO: Test that this optimziation works.
@register_canonicalize
@gof.local_optimizer([])
def local_scalar_reshape(node):
"""Eliminate reshape Ops whose inputs and outputs are scalars """
if isinstance(node.op, T.Reshape):
x, shp = node.inputs
if x.ndim == 0 and T.get_vector_length(shp)==0:
return [x]
if 0:
# TODO: Finish writing and testing this optimization.
# The idea is that if we can prove the output to this sum
# has a zero-size dimension, then it can be replaced by an appropriately typed and
# broadcasted zero.
@register_canonicalize
@gof.local_optimizer([])
def local_sum_over_empty(node):
if isinstance(node.op, T.Sum):
y, = node.outputs
y_shape = node.env.shape_feature.shape_of[y]
def tmp(thing):
try:
return T.get_constant_value(thing)
except (TypeError, ValueError), e:
print e, thing.owner.inputs[0]
return None
print 'LOCAL SUM EMPTY', [tmp(s) for s in y_shape]
##################
# Middleman cuts #
......@@ -1063,11 +1096,18 @@ def local_fill_cut(node):
If c.type == a.type.
"""
# this optimization is essentially for getting broadcasting to replace fill.
# This is always possible when using a Compound Elemwise operation,
# but it is not always possible without one (consider filling a large matrix with a scalar,
# and then adding another scalar. The only numbers that count are the two scalars, but we
# can't ignore the large matrix because it gives the shape of the result.
if not opt.check_chain(node, T.Elemwise):
return False
output = node.outputs[0]
try:
#reference is some input with the same type as the input but that is not produced by a fill
reference = [input
for input in node.inputs
if input.type == output.type and (not input.owner or input.owner.op != T.fill)][0]
......@@ -1086,7 +1126,12 @@ def local_fill_cut(node):
if new_inputs == node.inputs:
return False
return node.op.make_node(*new_inputs).outputs
rval = node.op(*new_inputs)
if isinstance(rval, gof.Variable):
return rval.owner.outputs
else:
return rval[0].owner.outputs
register_canonicalize(local_fill_cut)
......
import unittest
import theano
from theano.tensor import *
from theano.tests import unittest_tools as utt
class TestRealImag(unittest.TestCase):
......@@ -15,9 +16,60 @@ class TestRealImag(unittest.TestCase):
x= dvector()
rng = numpy.random.RandomState(23)
xval = rng.randn(10)
assert numpy.all( 0 == theano.function([x], imag(x))(xval))
assert numpy.all( xval == theano.function([x], real(x))(xval))
try:
numpy.all( 0 == theano.function([x], imag(x))(xval))
assert 0
except TypeError:
pass
try:
numpy.all( xval == theano.function([x], real(x))(xval))
assert 0
except TypeError:
pass
def test_cast(self):
    # Casting a complex vector to an integer dtype is undefined; it must
    # raise TypeError rather than silently drop the imaginary part.
    x= zvector()
    self.failUnlessRaises(TypeError, cast, x, 'int32')
def test_complex(self):
    # Round-trip: build a complex vector from two float32 rows, then take
    # it apart again with real()/imag() and check exact recovery, plus the
    # expected symbolic types at each step.
    rng = numpy.random.RandomState(2333)
    m = fmatrix()
    c = complex(m[0], m[1])
    assert c.type == cvector
    r,i = [real(c), imag(c)]
    assert r.type == fvector
    assert i.type == fvector
    f = theano.function([m], [r,i] )
    mval = numpy.asarray(rng.randn(2,5), dtype='float32')
    rval, ival = f(mval)
    assert numpy.all(rval == mval[0]), (rval,mval[0])
    assert numpy.all(ival == mval[1]), (ival, mval[1])
def test_complex_grads(self):
    # Numerically verify the gradient of a real-valued function that
    # routes through complex() / real() / imag().
    def f(m):
        c = complex(m[0], m[1])
        return .5 * real(c) + .9 * imag(c)
    rng = numpy.random.RandomState(9333)
    mval = numpy.asarray(rng.randn(2,5))
    utt.verify_grad(f, [mval])
def test_polar_grads(self):
    # Numerically verify the gradient through complex_from_polar.
    # abs(m[0]) keeps the radius non-negative, as the op requires.
    def f(m):
        c = complex_from_polar(abs(m[0]), m[1])
        return .5 * real(c) + .9 * imag(c)
    rng = numpy.random.RandomState(9333)
    mval = numpy.asarray(rng.randn(2,5))
    utt.verify_grad(f, [mval])
def test_abs_grad(self):
    # Numerically verify the gradient of abs() on complex input.
    def f(m):
        c = complex(m[0], m[1])
        return .5 * abs(c)
    rng = numpy.random.RandomState(9333)
    mval = numpy.asarray(rng.randn(2,5))
    utt.verify_grad(f, [mval])
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论