提交 c51f5936 作者: Brandon T. Willard

Replace theano.tensor alias T with tt in theano.tensor sub-package

Indirect references to theano.tensor imports at module level were also converted to direct references in quite a few cases.
上级 2cb3a154
import theano.tensor.basic as tt
from theano import config from theano import config
from theano.gof.params_type import ParamsType from theano.gof.params_type import ParamsType
from theano.scalar import bool as bool_t from theano.scalar import bool as bool_t
...@@ -6,7 +8,6 @@ from theano.tensor.blas import ldflags, blas_header_text, blas_header_version ...@@ -6,7 +8,6 @@ from theano.tensor.blas import ldflags, blas_header_text, blas_header_version
from theano.tensor.blas import blas_optdb, optdb, local_optimizer from theano.tensor.blas import blas_optdb, optdb, local_optimizer
from theano.tensor.blas import Ger, ger, ger_destructive from theano.tensor.blas import Ger, ger, ger_destructive
from theano.tensor.blas import Gemv, gemv_inplace, gemv_no_inplace from theano.tensor.blas import Gemv, gemv_inplace, gemv_no_inplace
from theano.tensor import basic as T
class BaseBLAS(object): class BaseBLAS(object):
...@@ -706,10 +707,10 @@ def make_c_gemv_destructive(node): ...@@ -706,10 +707,10 @@ def make_c_gemv_destructive(node):
dest = inputs[0] dest = inputs[0]
if ( if (
dest.owner dest.owner
and isinstance(dest.owner.op, T.AllocEmpty) and isinstance(dest.owner.op, tt.AllocEmpty)
and len(dest.clients) > 1 and len(dest.clients) > 1
): ):
inputs[0] = T.AllocEmpty(dest.dtype)(*dest.owner.inputs) inputs[0] = tt.AllocEmpty(dest.dtype)(*dest.owner.inputs)
return [cgemv_inplace(*inputs)] return [cgemv_inplace(*inputs)]
......
import numpy as np import numpy as np
import theano.tensor as tt
from theano import gof from theano import gof
import theano.tensor as T
from theano.gradient import DisconnectedType from theano.gradient import DisconnectedType
...@@ -10,10 +12,10 @@ class RFFTOp(gof.Op): ...@@ -10,10 +12,10 @@ class RFFTOp(gof.Op):
def output_type(self, inp): def output_type(self, inp):
# add extra dim for real/imag # add extra dim for real/imag
return T.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim + 1)) return tt.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim + 1))
def make_node(self, a, s=None): def make_node(self, a, s=None):
a = T.as_tensor_variable(a) a = tt.as_tensor_variable(a)
if a.ndim < 2: if a.ndim < 2:
raise TypeError( raise TypeError(
"%s: input must have dimension > 2, with first dimension batches" "%s: input must have dimension > 2, with first dimension batches"
...@@ -22,10 +24,10 @@ class RFFTOp(gof.Op): ...@@ -22,10 +24,10 @@ class RFFTOp(gof.Op):
if s is None: if s is None:
s = a.shape[1:] s = a.shape[1:]
s = T.as_tensor_variable(s) s = tt.as_tensor_variable(s)
else: else:
s = T.as_tensor_variable(s) s = tt.as_tensor_variable(s)
if s.dtype not in T.integer_dtypes: if s.dtype not in tt.integer_dtypes:
raise TypeError( raise TypeError(
"%s: length of the transformed axis must be" "%s: length of the transformed axis must be"
" of type integer" % self.__class__.__name__ " of type integer" % self.__class__.__name__
...@@ -54,7 +56,7 @@ class RFFTOp(gof.Op): ...@@ -54,7 +56,7 @@ class RFFTOp(gof.Op):
+ [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
+ [slice(None)] + [slice(None)]
) )
gout = T.set_subtensor(gout[idx], gout[idx] * 0.5) gout = tt.set_subtensor(gout[idx], gout[idx] * 0.5)
return [irfft_op(gout, s), DisconnectedType()()] return [irfft_op(gout, s), DisconnectedType()()]
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -71,10 +73,10 @@ class IRFFTOp(gof.Op): ...@@ -71,10 +73,10 @@ class IRFFTOp(gof.Op):
def output_type(self, inp): def output_type(self, inp):
# remove extra dim for real/imag # remove extra dim for real/imag
return T.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim - 1)) return tt.TensorType(inp.dtype, broadcastable=[False] * (inp.type.ndim - 1))
def make_node(self, a, s=None): def make_node(self, a, s=None):
a = T.as_tensor_variable(a) a = tt.as_tensor_variable(a)
if a.ndim < 3: if a.ndim < 3:
raise TypeError( raise TypeError(
"%s: input must have dimension >= 3, with " % self.__class__.__name__ "%s: input must have dimension >= 3, with " % self.__class__.__name__
...@@ -83,11 +85,11 @@ class IRFFTOp(gof.Op): ...@@ -83,11 +85,11 @@ class IRFFTOp(gof.Op):
if s is None: if s is None:
s = a.shape[1:-1] s = a.shape[1:-1]
s = T.set_subtensor(s[-1], (s[-1] - 1) * 2) s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2)
s = T.as_tensor_variable(s) s = tt.as_tensor_variable(s)
else: else:
s = T.as_tensor_variable(s) s = tt.as_tensor_variable(s)
if s.dtype not in T.integer_dtypes: if s.dtype not in tt.integer_dtypes:
raise TypeError( raise TypeError(
"%s: length of the transformed axis must be" "%s: length of the transformed axis must be"
" of type integer" % self.__class__.__name__ " of type integer" % self.__class__.__name__
...@@ -117,7 +119,7 @@ class IRFFTOp(gof.Op): ...@@ -117,7 +119,7 @@ class IRFFTOp(gof.Op):
+ [slice(1, (s[-1] // 2) + (s[-1] % 2))] + [slice(1, (s[-1] // 2) + (s[-1] % 2))]
+ [slice(None)] + [slice(None)]
) )
gf = T.set_subtensor(gf[idx], gf[idx] * 2) gf = tt.set_subtensor(gf[idx], gf[idx] * 2)
return [gf, DisconnectedType()()] return [gf, DisconnectedType()()]
def connection_pattern(self, node): def connection_pattern(self, node):
...@@ -157,7 +159,7 @@ def rfft(inp, norm=None): ...@@ -157,7 +159,7 @@ def rfft(inp, norm=None):
cond_norm = _unitary(norm) cond_norm = _unitary(norm)
scaling = 1 scaling = 1
if cond_norm == "ortho": if cond_norm == "ortho":
scaling = T.sqrt(s.prod().astype(inp.dtype)) scaling = tt.sqrt(s.prod().astype(inp.dtype))
return rfft_op(inp, s) / scaling return rfft_op(inp, s) / scaling
...@@ -196,9 +198,9 @@ def irfft(inp, norm=None, is_odd=False): ...@@ -196,9 +198,9 @@ def irfft(inp, norm=None, is_odd=False):
s = inp.shape[1:-1] s = inp.shape[1:-1]
if is_odd: if is_odd:
s = T.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1) s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2 + 1)
else: else:
s = T.set_subtensor(s[-1], (s[-1] - 1) * 2) s = tt.set_subtensor(s[-1], (s[-1] - 1) * 2)
cond_norm = _unitary(norm) cond_norm = _unitary(norm)
scaling = 1 scaling = 1
...@@ -206,7 +208,7 @@ def irfft(inp, norm=None, is_odd=False): ...@@ -206,7 +208,7 @@ def irfft(inp, norm=None, is_odd=False):
if cond_norm is None: if cond_norm is None:
scaling = s.prod().astype(inp.dtype) scaling = s.prod().astype(inp.dtype)
elif cond_norm == "ortho": elif cond_norm == "ortho":
scaling = T.sqrt(s.prod().astype(inp.dtype)) scaling = tt.sqrt(s.prod().astype(inp.dtype))
return irfft_op(inp, s) / scaling return irfft_op(inp, s) / scaling
......
import numpy as np import numpy as np
import theano import theano
import theano.tensor.basic as tt
from theano import Apply, Op from theano import Apply, Op
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.opt import copy_stack_trace from theano.gof.opt import copy_stack_trace
from theano.tensor import as_tensor_variable, TensorType from theano.scalar import Composite, add, as_common_dtype, mul, sub, true_div
from theano.tensor import basic as T from theano.tensor import TensorType, as_tensor_variable
from theano.tensor.elemwise import Elemwise
from theano.tensor.opt import register_specialize_device from theano.tensor.opt import register_specialize_device
from theano.scalar import Composite, as_common_dtype
from theano.scalar import add, sub, true_div, mul
class BNComposite(Composite): class BNComposite(Composite):
...@@ -72,9 +74,7 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"): ...@@ -72,9 +74,7 @@ def batch_normalization(inputs, gamma, beta, mean, std, mode="low_mem"):
between implementation is likely to be less important on the full model fprop/bprop. between implementation is likely to be less important on the full model fprop/bprop.
""" """
if mode == "low_mem": if mode == "low_mem":
elm_bn = theano.tensor.elemwise.Elemwise( elm_bn = Elemwise(scalar_op=BNComposite(dtype=inputs.dtype))
scalar_op=BNComposite(dtype=inputs.dtype)
)
rval = elm_bn(inputs, mean, std, gamma, beta) rval = elm_bn(inputs, mean, std, gamma, beta)
elif mode == "high_mem": elif mode == "high_mem":
rval = (inputs - mean) * (gamma / std) + beta rval = (inputs - mean) * (gamma / std) + beta
...@@ -239,8 +239,8 @@ def batch_normalization_train( ...@@ -239,8 +239,8 @@ def batch_normalization_train(
gamma = gamma.dimshuffle(params_dimshuffle_pattern) gamma = gamma.dimshuffle(params_dimshuffle_pattern)
beta = beta.dimshuffle(params_dimshuffle_pattern) beta = beta.dimshuffle(params_dimshuffle_pattern)
else: else:
gamma = T.addbroadcast(gamma, *axes) gamma = tt.addbroadcast(gamma, *axes)
beta = T.addbroadcast(beta, *axes) beta = tt.addbroadcast(beta, *axes)
batchnorm_op = AbstractBatchNormTrain(axes=axes) batchnorm_op = AbstractBatchNormTrain(axes=axes)
...@@ -251,8 +251,8 @@ def batch_normalization_train( ...@@ -251,8 +251,8 @@ def batch_normalization_train(
running_mean = running_mean.dimshuffle(params_dimshuffle_pattern) running_mean = running_mean.dimshuffle(params_dimshuffle_pattern)
running_var = running_var.dimshuffle(params_dimshuffle_pattern) running_var = running_var.dimshuffle(params_dimshuffle_pattern)
else: else:
running_mean = T.addbroadcast(running_mean, *axes) running_mean = tt.addbroadcast(running_mean, *axes)
running_var = T.addbroadcast(running_var, *axes) running_var = tt.addbroadcast(running_var, *axes)
out, mean, invstd, new_running_mean, new_running_var = batchnorm_op( out, mean, invstd, new_running_mean, new_running_var = batchnorm_op(
inputs, inputs,
gamma, gamma,
...@@ -263,11 +263,11 @@ def batch_normalization_train( ...@@ -263,11 +263,11 @@ def batch_normalization_train(
running_var=running_var, running_var=running_var,
) )
if new_running_mean.broadcastable != running_mean.broadcastable: if new_running_mean.broadcastable != running_mean.broadcastable:
new_running_mean = T.patternbroadcast( new_running_mean = tt.patternbroadcast(
new_running_mean, running_mean.broadcastable new_running_mean, running_mean.broadcastable
) )
if new_running_var.broadcastable != running_var.broadcastable: if new_running_var.broadcastable != running_var.broadcastable:
new_running_var = T.patternbroadcast( new_running_var = tt.patternbroadcast(
new_running_var, running_var.broadcastable new_running_var, running_var.broadcastable
) )
results = (out, mean, invstd, new_running_mean, new_running_var) results = (out, mean, invstd, new_running_mean, new_running_var)
...@@ -376,10 +376,10 @@ def batch_normalization_test( ...@@ -376,10 +376,10 @@ def batch_normalization_test(
mean = mean.dimshuffle(params_dimshuffle_pattern) mean = mean.dimshuffle(params_dimshuffle_pattern)
var = var.dimshuffle(params_dimshuffle_pattern) var = var.dimshuffle(params_dimshuffle_pattern)
else: else:
gamma = T.addbroadcast(gamma, *axes) gamma = tt.addbroadcast(gamma, *axes)
beta = T.addbroadcast(beta, *axes) beta = tt.addbroadcast(beta, *axes)
mean = T.addbroadcast(mean, *axes) mean = tt.addbroadcast(mean, *axes)
var = T.addbroadcast(var, *axes) var = tt.addbroadcast(var, *axes)
batchnorm_op = AbstractBatchNormInference(axes=axes) batchnorm_op = AbstractBatchNormInference(axes=axes)
return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon) return batchnorm_op(inputs, gamma, beta, mean, var, epsilon=epsilon)
...@@ -610,14 +610,13 @@ class AbstractBatchNormInference(Op): ...@@ -610,14 +610,13 @@ class AbstractBatchNormInference(Op):
) )
scale, bias, est_mean, est_var = ( scale, bias, est_mean, est_var = (
theano.tensor.addbroadcast(t, *axes) tt.addbroadcast(t, *axes) for t in (scale, bias, est_mean, est_var)
for t in (scale, bias, est_mean, est_var)
) )
# define helper expressions # define helper expressions
est_var_eps = est_var + epsilon est_var_eps = est_var + epsilon
est_std = theano.tensor.sqrt(est_var_eps) est_std = tt.sqrt(est_var_eps)
two = theano.tensor.constant(2.0) two = tt.constant(2.0)
# define and return gradients # define and return gradients
dx = dy * (scale / est_std) dx = dy * (scale / est_std)
...@@ -673,7 +672,7 @@ class AbstractBatchNormTrainGrad(Op): ...@@ -673,7 +672,7 @@ class AbstractBatchNormTrainGrad(Op):
ddinputs, ddscale, ddbias = grads ddinputs, ddscale, ddbias = grads
x_diff = x - x_mean x_diff = x - x_mean
mean_dy_x_diff = T.mean(dy * x_diff, axis=self.axes, keepdims=True) mean_dy_x_diff = tt.mean(dy * x_diff, axis=self.axes, keepdims=True)
# compute gradients given each of the output gradients # compute gradients given each of the output gradients
g_wrt_x = 0 g_wrt_x = 0
...@@ -683,10 +682,10 @@ class AbstractBatchNormTrainGrad(Op): ...@@ -683,10 +682,10 @@ class AbstractBatchNormTrainGrad(Op):
g_wrt_x_invstd = 0 g_wrt_x_invstd = 0
if not isinstance(ddinputs.type, theano.gradient.DisconnectedType): if not isinstance(ddinputs.type, theano.gradient.DisconnectedType):
ccc = scale * (ddinputs - T.mean(ddinputs, axis=self.axes, keepdims=True)) ccc = scale * (ddinputs - tt.mean(ddinputs, axis=self.axes, keepdims=True))
ddd = (x_invstd ** 3) * ( ddd = (x_invstd ** 3) * (
ccc * T.mean(dy * x_diff, axis=self.axes, keepdims=True) ccc * tt.mean(dy * x_diff, axis=self.axes, keepdims=True)
+ dy * T.mean(ccc * x_diff, axis=self.axes, keepdims=True) + dy * tt.mean(ccc * x_diff, axis=self.axes, keepdims=True)
) )
g_wrt_x = g_wrt_x - ddd g_wrt_x = g_wrt_x - ddd
...@@ -695,19 +694,19 @@ class AbstractBatchNormTrainGrad(Op): ...@@ -695,19 +694,19 @@ class AbstractBatchNormTrainGrad(Op):
- ( - (
(x_invstd ** 3) (x_invstd ** 3)
* x_diff * x_diff
* T.mean(ccc * x_diff, axis=self.axes, keepdims=True) * tt.mean(ccc * x_diff, axis=self.axes, keepdims=True)
) )
) )
eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff) eee = (dy * x_invstd) - ((x_invstd ** 3) * x_diff * mean_dy_x_diff)
g_wrt_scale = g_wrt_scale + T.sum( g_wrt_scale = g_wrt_scale + tt.sum(
ddinputs * (eee - T.mean(eee, axis=self.axes, keepdims=True)), ddinputs * (eee - tt.mean(eee, axis=self.axes, keepdims=True)),
axis=self.axes, axis=self.axes,
keepdims=True, keepdims=True,
) )
g_wrt_x_mean = g_wrt_x_mean + T.sum(ddd, axis=self.axes, keepdims=True) g_wrt_x_mean = g_wrt_x_mean + tt.sum(ddd, axis=self.axes, keepdims=True)
g_wrt_x_invstd = g_wrt_x_invstd + T.sum( g_wrt_x_invstd = g_wrt_x_invstd + tt.sum(
ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff), ccc * (dy - 3 * (x_invstd ** 2) * x_diff * mean_dy_x_diff),
axis=self.axes, axis=self.axes,
keepdims=True, keepdims=True,
...@@ -717,14 +716,14 @@ class AbstractBatchNormTrainGrad(Op): ...@@ -717,14 +716,14 @@ class AbstractBatchNormTrainGrad(Op):
g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy) g_wrt_x = g_wrt_x + (x_invstd * ddscale * dy)
g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff) g_wrt_dy = g_wrt_dy + (x_invstd * ddscale * x_diff)
g_wrt_x_mean = g_wrt_x_mean - ( g_wrt_x_mean = g_wrt_x_mean - (
x_invstd * ddscale * T.sum(dy, axis=self.axes, keepdims=True) x_invstd * ddscale * tt.sum(dy, axis=self.axes, keepdims=True)
) )
g_wrt_x_invstd = g_wrt_x_invstd + ( g_wrt_x_invstd = g_wrt_x_invstd + (
ddscale * T.sum(dy * x_diff, axis=self.axes, keepdims=True) ddscale * tt.sum(dy * x_diff, axis=self.axes, keepdims=True)
) )
if not isinstance(ddbias.type, theano.gradient.DisconnectedType): if not isinstance(ddbias.type, theano.gradient.DisconnectedType):
g_wrt_dy = g_wrt_dy + T.fill(dy, ddbias) g_wrt_dy = g_wrt_dy + tt.fill(dy, ddbias)
# depending on which output gradients are given, # depending on which output gradients are given,
# some inputs should be disconnected # some inputs should be disconnected
...@@ -804,7 +803,7 @@ def local_abstract_batch_norm_train(node): ...@@ -804,7 +803,7 @@ def local_abstract_batch_norm_train(node):
# The epsilon should not upcast the dtype. # The epsilon should not upcast the dtype.
if var.dtype == "float32" and epsilon.dtype == "float64": if var.dtype == "float32" and epsilon.dtype == "float64":
epsilon = epsilon.astype("float32") epsilon = epsilon.astype("float32")
invstd = T.inv(T.sqrt(var + epsilon)) invstd = tt.inv(tt.sqrt(var + epsilon))
out = (x - mean) * (scale * invstd) + bias out = (x - mean) * (scale * invstd) + bias
results = [out, mean, invstd] results = [out, mean, invstd]
...@@ -816,7 +815,7 @@ def local_abstract_batch_norm_train(node): ...@@ -816,7 +815,7 @@ def local_abstract_batch_norm_train(node):
) )
results.append(running_mean) results.append(running_mean)
if len(node.inputs) > 6: if len(node.inputs) > 6:
m = T.cast(T.prod(x.shape) / T.prod(scale.shape), theano.config.floatX) m = tt.cast(tt.prod(x.shape) / tt.prod(scale.shape), theano.config.floatX)
running_var = node.inputs[6] running_var = node.inputs[6]
running_var = ( running_var = (
running_var * (1.0 - running_average_factor) running_var * (1.0 - running_average_factor)
...@@ -825,7 +824,7 @@ def local_abstract_batch_norm_train(node): ...@@ -825,7 +824,7 @@ def local_abstract_batch_norm_train(node):
results.append(running_var) results.append(running_var)
results = [ results = [
T.patternbroadcast(r, r_orig.broadcastable) tt.patternbroadcast(r, r_orig.broadcastable)
for (r, r_orig) in zip(results, node.outputs) for (r, r_orig) in zip(results, node.outputs)
] ]
...@@ -855,16 +854,16 @@ def local_abstract_batch_norm_train_grad(node): ...@@ -855,16 +854,16 @@ def local_abstract_batch_norm_train_grad(node):
return None return None
x_diff = x - x_mean x_diff = x - x_mean
mean_dy_x_diff = T.mean(dy * x_diff, axis=axes, keepdims=True) mean_dy_x_diff = tt.mean(dy * x_diff, axis=axes, keepdims=True)
c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3)) c = (dy * x_invstd) - x_diff * (mean_dy_x_diff * (x_invstd ** 3))
g_wrt_inputs = scale * (c - T.mean(c, axis=axes, keepdims=True)) g_wrt_inputs = scale * (c - tt.mean(c, axis=axes, keepdims=True))
g_wrt_scale = T.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True) g_wrt_scale = tt.sum(dy * x_invstd * x_diff, axis=axes, keepdims=True)
g_wrt_bias = T.sum(dy, axis=axes, keepdims=True) g_wrt_bias = tt.sum(dy, axis=axes, keepdims=True)
results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias] results = [g_wrt_inputs, g_wrt_scale, g_wrt_bias]
results = [ results = [
T.patternbroadcast(r, r_orig.broadcastable) tt.patternbroadcast(r, r_orig.broadcastable)
for (r, r_orig) in zip(results, node.outputs) for (r, r_orig) in zip(results, node.outputs)
] ]
...@@ -896,9 +895,9 @@ def local_abstract_batch_norm_inference(node): ...@@ -896,9 +895,9 @@ def local_abstract_batch_norm_inference(node):
epsilon = epsilon.astype("float32") epsilon = epsilon.astype("float32")
result = (x - estimated_mean) * ( result = (x - estimated_mean) * (
scale / T.sqrt(estimated_variance + epsilon) scale / tt.sqrt(estimated_variance + epsilon)
) + bias ) + bias
result = T.patternbroadcast(result, node.outputs[0].broadcastable) result = tt.patternbroadcast(result, node.outputs[0].broadcastable)
for var in theano.gof.graph.variables(node.inputs, [result]): for var in theano.gof.graph.variables(node.inputs, [result]):
if var not in node.inputs: if var not in node.inputs:
......
import os import os
import sys import sys
import theano.tensor as T
from theano import config import theano.tensor as tt
from theano import gof
from theano import config, gof
from theano.gof import local_optimizer from theano.gof import local_optimizer
from theano.gof.cmodule import GCC_compiler from theano.gof.cmodule import GCC_compiler
from theano.tensor.opt import register_canonicalize
from theano.tensor.extra_ops import cpu_contiguous
from theano.gradient import grad_undefined from theano.gradient import grad_undefined
from theano.tensor.extra_ops import cpu_contiguous
from theano.tensor.opt import register_canonicalize
def _ctc_find_lib(): def _ctc_find_lib():
...@@ -156,12 +157,12 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp): ...@@ -156,12 +157,12 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
return ["ctc.h"] + gof.OpenMPOp.c_headers(self) return ["ctc.h"] + gof.OpenMPOp.c_headers(self)
def make_node(self, activations, labels, input_lengths): def make_node(self, activations, labels, input_lengths):
t_activations = T.as_tensor_variable(activations) t_activations = tt.as_tensor_variable(activations)
# Ensure activations array is C-contiguous # Ensure activations array is C-contiguous
t_activations = cpu_contiguous(t_activations) t_activations = cpu_contiguous(t_activations)
t_labels = T.as_tensor_variable(labels) t_labels = tt.as_tensor_variable(labels)
t_input_lengths = T.as_tensor_variable(input_lengths) t_input_lengths = tt.as_tensor_variable(input_lengths)
if t_activations.type.dtype != "float32": if t_activations.type.dtype != "float32":
raise TypeError("activations must use the float32 type!") raise TypeError("activations must use the float32 type!")
...@@ -181,10 +182,10 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp): ...@@ -181,10 +182,10 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
if t_input_lengths.ndim != 1: if t_input_lengths.ndim != 1:
raise ValueError("input_lengths must have 1 dimension.") raise ValueError("input_lengths must have 1 dimension.")
costs = T.fvector(name="ctc_cost") costs = tt.fvector(name="ctc_cost")
outputs = [costs] outputs = [costs]
if self.compute_grad: if self.compute_grad:
gradients = T.ftensor3(name="ctc_grad") gradients = tt.ftensor3(name="ctc_grad")
outputs += [gradients] outputs += [gradients]
return gof.Apply( return gof.Apply(
...@@ -197,9 +198,9 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp): ...@@ -197,9 +198,9 @@ class ConnectionistTemporalClassification(gof.COp, gof.OpenMPOp):
assert gradients is not None assert gradients is not None
grad_op = output_grads[0] grad_op = output_grads[0]
total_grad = T.basic.batched_dot( total_grad = tt.batched_dot(grad_op, gradients.dimshuffle(1, 0, 2)).dimshuffle(
grad_op, gradients.dimshuffle(1, 0, 2) 1, 0, 2
).dimshuffle(1, 0, 2) )
return [ return [
total_grad, total_grad,
grad_undefined(self, 1, inputs[1]), grad_undefined(self, 1, inputs[1]),
......
...@@ -2,16 +2,14 @@ ...@@ -2,16 +2,14 @@
TODO: implement Images2Neibs.infer_shape() methods TODO: implement Images2Neibs.infer_shape() methods
""" """
import numpy as np import numpy as np
import theano import theano
from theano import Op, Apply import theano.tensor as tt
from theano import Apply, Op
from theano.gof import EnumList from theano.gof import EnumList
import theano.tensor as T from theano.gradient import grad_not_implemented, grad_undefined
from theano.gradient import grad_not_implemented
from theano.gradient import grad_undefined
class Images2Neibs(Op): class Images2Neibs(Op):
...@@ -102,19 +100,19 @@ class Images2Neibs(Op): ...@@ -102,19 +100,19 @@ class Images2Neibs(Op):
pattern. pattern.
""" """
ten4 = T.as_tensor_variable(ten4) ten4 = tt.as_tensor_variable(ten4)
neib_shape = T.as_tensor_variable(neib_shape) neib_shape = tt.as_tensor_variable(neib_shape)
if neib_step is None: if neib_step is None:
neib_step = neib_shape neib_step = neib_shape
else: else:
neib_step = T.as_tensor_variable(neib_step) neib_step = tt.as_tensor_variable(neib_step)
assert ten4.ndim == 4 assert ten4.ndim == 4
assert neib_shape.ndim == 1 assert neib_shape.ndim == 1
assert neib_step.ndim == 1 assert neib_step.ndim == 1
return Apply( return Apply(
self, [ten4, neib_shape, neib_step], [T.matrix(dtype=ten4.type.dtype)] self, [ten4, neib_shape, neib_step], [tt.matrix(dtype=ten4.type.dtype)]
) )
def grad(self, inp, grads): def grad(self, inp, grads):
...@@ -165,14 +163,14 @@ class Images2Neibs(Op): ...@@ -165,14 +163,14 @@ class Images2Neibs(Op):
+ ((rows - nrows) // rstep + 1,) + ((rows - nrows) // rstep + 1,)
+ ((cols - ncols) // cstep + 1,) + ((cols - ncols) // cstep + 1,)
) )
return T.inc_subtensor(result_indices, pgz.reshape(newshape)) return tt.inc_subtensor(result_indices, pgz.reshape(newshape))
indices = T.arange(neib_shape[0] * neib_shape[1]) indices = tt.arange(neib_shape[0] * neib_shape[1])
pgzs = gz.dimshuffle((1, 0)) pgzs = gz.dimshuffle((1, 0))
result, _ = theano.scan( result, _ = theano.scan(
fn=pos2map, fn=pos2map,
sequences=[indices, pgzs], sequences=[indices, pgzs],
outputs_info=T.zeros(x.shape), outputs_info=tt.zeros(x.shape),
non_sequences=[neib_shape, neib_step], non_sequences=[neib_shape, neib_step],
) )
grad_input = result[-1] grad_input = result[-1]
...@@ -354,8 +352,8 @@ class Images2Neibs(Op): ...@@ -354,8 +352,8 @@ class Images2Neibs(Op):
c, d = node.inputs[1] c, d = node.inputs[1]
step_x, step_y = node.inputs[2] step_x, step_y = node.inputs[2]
if self.mode == "wrap_centered": if self.mode == "wrap_centered":
grid_c = T.ceil_intdiv(in_shape[2], step_x) grid_c = tt.ceil_intdiv(in_shape[2], step_x)
grid_d = T.ceil_intdiv(in_shape[3], step_y) grid_d = tt.ceil_intdiv(in_shape[3], step_y)
elif self.mode == "valid": elif self.mode == "valid":
grid_c = 1 + ((in_shape[2] - c) // step_x) grid_c = 1 + ((in_shape[2] - c) // step_x)
grid_d = 1 + ((in_shape[3] - d) // step_y) grid_d = 1 + ((in_shape[3] - d) // step_y)
...@@ -795,11 +793,11 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"): ...@@ -795,11 +793,11 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
.. note:: The code will output the initial image array. .. note:: The code will output the initial image array.
""" """
neibs = T.as_tensor_variable(neibs) neibs = tt.as_tensor_variable(neibs)
neib_shape = T.as_tensor_variable(neib_shape) neib_shape = tt.as_tensor_variable(neib_shape)
original_shape = T.as_tensor_variable(original_shape) original_shape = tt.as_tensor_variable(original_shape)
new_neib_shape = T.stack([original_shape[-1] // neib_shape[1], neib_shape[1]]) new_neib_shape = tt.stack([original_shape[-1] // neib_shape[1], neib_shape[1]])
output_2d = images2neibs( output_2d = images2neibs(
neibs.dimshuffle("x", "x", 0, 1), new_neib_shape, mode=mode neibs.dimshuffle("x", "x", 0, 1), new_neib_shape, mode=mode
) )
...@@ -809,10 +807,10 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"): ...@@ -809,10 +807,10 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
# the shape and still raise error when it don't have the right # the shape and still raise error when it don't have the right
# shape. # shape.
valid_shape = original_shape valid_shape = original_shape
valid_shape = T.set_subtensor( valid_shape = tt.set_subtensor(
valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0] valid_shape[2], (valid_shape[2] // neib_shape[0]) * neib_shape[0]
) )
valid_shape = T.set_subtensor( valid_shape = tt.set_subtensor(
valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1] valid_shape[3], (valid_shape[3] // neib_shape[1]) * neib_shape[1]
) )
output_4d = output_2d.reshape(valid_shape, ndim=4) output_4d = output_2d.reshape(valid_shape, ndim=4)
...@@ -820,7 +818,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"): ...@@ -820,7 +818,7 @@ def neibs2images(neibs, neib_shape, original_shape, mode="valid"):
for d in [2, 3]: for d in [2, 3]:
pad_shape = list(output_4d.shape) pad_shape = list(output_4d.shape)
pad_shape[d] = original_shape[d] - valid_shape[d] pad_shape[d] = original_shape[d] - valid_shape[d]
output_4d = T.concatenate([output_4d, T.zeros(pad_shape)], axis=d) output_4d = tt.concatenate([output_4d, tt.zeros(pad_shape)], axis=d)
elif mode == "valid": elif mode == "valid":
# TODO: we do not implement all mode with this code. # TODO: we do not implement all mode with this code.
# Add a check for the good cases. # Add a check for the good cases.
......
...@@ -15,31 +15,37 @@ revisited later when all the intermediate part are on the GPU. ...@@ -15,31 +15,37 @@ revisited later when all the intermediate part are on the GPU.
import logging import logging
import warnings import warnings
import numpy as np
import numpy as np
import theano import theano
from theano import gof import theano.tensor.basic as tt
from theano import scalar from theano import scalar
from theano.tensor import extra_ops, as_tensor_variable
from theano.gof.opt import copy_stack_trace
from theano.tensor import basic as tensor, subtensor, opt, elemwise
from theano.tensor.type import values_eq_approx_remove_inf, values_eq_approx_remove_nan
from theano.compile import optdb from theano.compile import optdb
from theano.gof import Apply from theano.gof.graph import Apply
from theano.gof.op import Op
from theano.tensor.nnet.sigm import sigmoid, softplus from theano.tensor.opt import (
from theano.gradient import DisconnectedType register_specialize,
from theano.gradient import grad_not_implemented register_stabilize,
register_canonicalize,
)
from theano.gof.opt import (
optimizer,
copy_stack_trace,
local_optimizer,
)
from theano.gradient import DisconnectedType, grad_not_implemented
from theano.scalar import UnaryScalarOp
from theano.tensor import as_tensor_variable, extra_ops, opt, subtensor
from theano.tensor.elemwise import Elemwise
from theano.tensor.subtensor import AdvancedSubtensor
from theano.tensor.basic import log, MaxAndArgmax
from theano.tensor.nnet.blocksparse import sparse_block_dot from theano.tensor.nnet.blocksparse import sparse_block_dot
from theano.tensor.nnet.sigm import sigmoid, softplus
############ from theano.tensor.type import values_eq_approx_remove_inf, values_eq_approx_remove_nan
#
# TENSOR OPS
#
class SoftmaxWithBias(gof.Op): class SoftmaxWithBias(Op):
""" """
An L{Op} for the output of neural-net multiclass classifiers. An L{Op} for the output of neural-net multiclass classifiers.
...@@ -58,11 +64,11 @@ class SoftmaxWithBias(gof.Op): ...@@ -58,11 +64,11 @@ class SoftmaxWithBias(gof.Op):
__props__ = () __props__ = ()
def make_node(self, x, b): def make_node(self, x, b):
x = tensor.as_tensor_variable(x) x = tt.as_tensor_variable(x)
b = tensor.as_tensor_variable(b) b = tt.as_tensor_variable(b)
if x.type.ndim != 2 or x.type.dtype not in tensor.float_dtypes: if x.type.ndim != 2 or x.type.dtype not in tt.float_dtypes:
raise ValueError("x must be 2-d tensor of floats") raise ValueError("x must be 2-d tensor of floats")
if b.type.ndim != 1 or b.type.dtype not in tensor.float_dtypes: if b.type.ndim != 1 or b.type.dtype not in tt.float_dtypes:
raise ValueError("b must be 1-d tensor of floats") raise ValueError("b must be 1-d tensor of floats")
sm = x.type() sm = x.type()
...@@ -105,7 +111,7 @@ class SoftmaxWithBias(gof.Op): ...@@ -105,7 +111,7 @@ class SoftmaxWithBias(gof.Op):
return [DisconnectedType()(), DisconnectedType()()] return [DisconnectedType()(), DisconnectedType()()]
dx = softmax_grad(g_sm, outputs[0]) dx = softmax_grad(g_sm, outputs[0])
db = tensor.sum(dx, axis=0) db = tt.sum(dx, axis=0)
return dx, db return dx, db
def infer_shape(self, node, shape): def infer_shape(self, node, shape):
...@@ -295,7 +301,7 @@ class SoftmaxWithBias(gof.Op): ...@@ -295,7 +301,7 @@ class SoftmaxWithBias(gof.Op):
softmax_with_bias = SoftmaxWithBias() softmax_with_bias = SoftmaxWithBias()
class SoftmaxGrad(gof.Op): class SoftmaxGrad(Op):
""" """
Gradient wrt x of the Softmax Op. Gradient wrt x of the Softmax Op.
...@@ -306,14 +312,14 @@ class SoftmaxGrad(gof.Op): ...@@ -306,14 +312,14 @@ class SoftmaxGrad(gof.Op):
__props__ = () __props__ = ()
def make_node(self, dy, sm): def make_node(self, dy, sm):
dy = tensor.as_tensor_variable(dy) dy = tt.as_tensor_variable(dy)
sm = tensor.as_tensor_variable(sm) sm = tt.as_tensor_variable(sm)
if dy.type.ndim not in (1, 2) or dy.type.dtype not in tensor.float_dtypes: if dy.type.ndim not in (1, 2) or dy.type.dtype not in tt.float_dtypes:
raise ValueError("dy must be 1-d or 2-d tensor of floats. Got ", dy.type) raise ValueError("dy must be 1-d or 2-d tensor of floats. Got ", dy.type)
if dy.ndim == 1: if dy.ndim == 1:
dy = tensor.shape_padleft(dy, n_ones=1) dy = tt.shape_padleft(dy, n_ones=1)
if sm.ndim == 1: if sm.ndim == 1:
sm = tensor.shape_padleft(sm, n_ones=1) sm = tt.shape_padleft(sm, n_ones=1)
return Apply(self, [dy, sm], [sm.type()]) return Apply(self, [dy, sm], [sm.type()])
def perform(self, node, input_storage, output_storage): def perform(self, node, input_storage, output_storage):
...@@ -329,10 +335,10 @@ class SoftmaxGrad(gof.Op): ...@@ -329,10 +335,10 @@ class SoftmaxGrad(gof.Op):
dy, sm = inp dy, sm = inp
(g,) = grads (g,) = grads
tmp = g + tensor.neg(tensor.sum(g * sm, axis=1).dimshuffle((0, "x"))) tmp = g + tt.neg(tt.sum(g * sm, axis=1).dimshuffle((0, "x")))
g_dy = tmp * sm g_dy = tmp * sm
tmp2 = tensor.sum(dy * sm, axis=1).dimshuffle((0, "x")) tmp2 = tt.sum(dy * sm, axis=1).dimshuffle((0, "x"))
g_sm = tmp * dy - g * tmp2 g_sm = tmp * dy - g * tmp2
return g_dy, g_sm return g_dy, g_sm
...@@ -416,7 +422,7 @@ class SoftmaxGrad(gof.Op): ...@@ -416,7 +422,7 @@ class SoftmaxGrad(gof.Op):
softmax_grad = SoftmaxGrad() softmax_grad = SoftmaxGrad()
class Softmax(gof.Op): class Softmax(Op):
r""" r"""
Softmax activation function Softmax activation function
:math:`\\varphi(\\mathbf{x})_j = :math:`\\varphi(\\mathbf{x})_j =
...@@ -431,8 +437,8 @@ class Softmax(gof.Op): ...@@ -431,8 +437,8 @@ class Softmax(gof.Op):
__props__ = () __props__ = ()
def make_node(self, x): def make_node(self, x):
x = tensor.as_tensor_variable(x) x = tt.as_tensor_variable(x)
if x.type.ndim not in (1, 2) or x.type.dtype not in tensor.float_dtypes: if x.type.ndim not in (1, 2) or x.type.dtype not in tt.float_dtypes:
raise ValueError("x must be 1-d or 2-d tensor of floats. Got %s" % x.type) raise ValueError("x must be 1-d or 2-d tensor of floats. Got %s" % x.type)
if x.ndim == 1: if x.ndim == 1:
warnings.warn( warnings.warn(
...@@ -441,7 +447,7 @@ class Softmax(gof.Op): ...@@ -441,7 +447,7 @@ class Softmax(gof.Op):
"vector case is gonna be supported soon and the output will be a vector.", "vector case is gonna be supported soon and the output will be a vector.",
stacklevel=4, stacklevel=4,
) )
x = tensor.shape_padleft(x, n_ones=1) x = tt.shape_padleft(x, n_ones=1)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
...@@ -616,7 +622,7 @@ class Softmax(gof.Op): ...@@ -616,7 +622,7 @@ class Softmax(gof.Op):
softmax_op = Softmax() softmax_op = Softmax()
class LogSoftmax(gof.Op): class LogSoftmax(Op):
r""" r"""
LogSoftmax activation function LogSoftmax activation function
:math:`\\varphi(\\mathbf{x})_j = :math:`\\varphi(\\mathbf{x})_j =
...@@ -629,8 +635,8 @@ class LogSoftmax(gof.Op): ...@@ -629,8 +635,8 @@ class LogSoftmax(gof.Op):
__props__ = () __props__ = ()
def make_node(self, x): def make_node(self, x):
x = tensor.as_tensor_variable(x) x = tt.as_tensor_variable(x)
if x.type.ndim not in (1, 2) or x.type.dtype not in tensor.float_dtypes: if x.type.ndim not in (1, 2) or x.type.dtype not in tt.float_dtypes:
raise ValueError("x must be 1-d or 2-d tensor of floats. Got %s" % x.type) raise ValueError("x must be 1-d or 2-d tensor of floats. Got %s" % x.type)
if x.ndim == 1: if x.ndim == 1:
warnings.warn( warnings.warn(
...@@ -639,7 +645,7 @@ class LogSoftmax(gof.Op): ...@@ -639,7 +645,7 @@ class LogSoftmax(gof.Op):
"vector case is gonna be supported soon and the output will be a vector.", "vector case is gonna be supported soon and the output will be a vector.",
stacklevel=4, stacklevel=4,
) )
x = tensor.shape_padleft(x, n_ones=1) x = tt.shape_padleft(x, n_ones=1)
return Apply(self, [x], [x.type()]) return Apply(self, [x], [x.type()])
...@@ -652,7 +658,7 @@ class LogSoftmax(gof.Op): ...@@ -652,7 +658,7 @@ class LogSoftmax(gof.Op):
def grad(self, inp, grads): def grad(self, inp, grads):
(x,) = inp (x,) = inp
sm = softmax_op(x) sm = softmax_op(x)
return [grads[0] - tensor.sum(grads[0], axis=1, keepdims=True) * sm] return [grads[0] - tt.sum(grads[0], axis=1, keepdims=True) * sm]
def R_op(self, inputs, eval_points): def R_op(self, inputs, eval_points):
# I think the Jacobian is symmetric so the R_op # I think the Jacobian is symmetric so the R_op
...@@ -765,8 +771,8 @@ logsoftmax_op = LogSoftmax() ...@@ -765,8 +771,8 @@ logsoftmax_op = LogSoftmax()
# This is not registered in stabilize, as it cause some crossentropy # This is not registered in stabilize, as it cause some crossentropy
# optimization to not be inserted. # optimization to not be inserted.
@opt.register_specialize("stabilize", "fast_compile") @register_specialize("stabilize", "fast_compile")
@gof.local_optimizer([tensor.Elemwise]) @local_optimizer([Elemwise])
def local_logsoftmax(node): def local_logsoftmax(node):
""" """
Detect Log(Softmax(x)) and replace it with LogSoftmax(x) Detect Log(Softmax(x)) and replace it with LogSoftmax(x)
...@@ -774,7 +780,7 @@ def local_logsoftmax(node): ...@@ -774,7 +780,7 @@ def local_logsoftmax(node):
Note: only forward pass is affected Note: only forward pass is affected
""" """
if ( if (
isinstance(node.op, tensor.Elemwise) isinstance(node.op, Elemwise)
and isinstance(node.op.scalar_op, scalar.basic.Log) and isinstance(node.op.scalar_op, scalar.basic.Log)
and len(node.inputs) == 1 and len(node.inputs) == 1
and node.inputs[0].owner is not None and node.inputs[0].owner is not None
...@@ -790,8 +796,8 @@ def local_logsoftmax(node): ...@@ -790,8 +796,8 @@ def local_logsoftmax(node):
# This is not registered in stabilize, as it cause some crossentropy # This is not registered in stabilize, as it cause some crossentropy
# optimization to not be inserted. # optimization to not be inserted.
@opt.register_specialize("stabilize", "fast_compile") @register_specialize("stabilize", "fast_compile")
@gof.local_optimizer([SoftmaxGrad]) @local_optimizer([SoftmaxGrad])
def local_logsoftmax_grad(node): def local_logsoftmax_grad(node):
""" """
Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad Detect Log(Softmax(x))'s grad and replace it with LogSoftmax(x)'s grad
...@@ -802,7 +808,7 @@ def local_logsoftmax_grad(node): ...@@ -802,7 +808,7 @@ def local_logsoftmax_grad(node):
isinstance(node.op, SoftmaxGrad) isinstance(node.op, SoftmaxGrad)
and len(node.inputs) == 2 and len(node.inputs) == 2
and node.inputs[0].owner is not None and node.inputs[0].owner is not None
and node.inputs[0].owner.op == tensor.true_div and node.inputs[0].owner.op == tt.true_div
and len(node.inputs[0].owner.inputs) >= 2 and len(node.inputs[0].owner.inputs) >= 2
and node.inputs[0].owner.inputs[1].owner is not None and node.inputs[0].owner.inputs[1].owner is not None
and node.inputs[0].owner.inputs[1].owner.op == softmax_op and node.inputs[0].owner.inputs[1].owner.op == softmax_op
...@@ -810,7 +816,7 @@ def local_logsoftmax_grad(node): ...@@ -810,7 +816,7 @@ def local_logsoftmax_grad(node):
and not ( and not (
# skip if it will be optimized by # skip if it will be optimized by
# local_advanced_indexing_crossentropy_onehot_grad # local_advanced_indexing_crossentropy_onehot_grad
node.inputs[0].owner.op == tensor.true_div node.inputs[0].owner.op == tt.true_div
and node.inputs[0].owner.inputs[0].owner is not None and node.inputs[0].owner.inputs[0].owner is not None
and isinstance( and isinstance(
node.inputs[0].owner.inputs[0].owner.op, subtensor.AdvancedIncSubtensor node.inputs[0].owner.inputs[0].owner.op, subtensor.AdvancedIncSubtensor
...@@ -822,15 +828,15 @@ def local_logsoftmax_grad(node): ...@@ -822,15 +828,15 @@ def local_logsoftmax_grad(node):
# sm_input = node.inputs[1].owner.inputs[0] # sm_input = node.inputs[1].owner.inputs[0]
grads = node.inputs[0].owner.inputs[0] grads = node.inputs[0].owner.inputs[0]
if grads.broadcastable[1] and not sm.broadcastable[1]: if grads.broadcastable[1] and not sm.broadcastable[1]:
grads = tensor.alloc(grads, grads.shape[0], sm.shape[1]) grads = tt.alloc(grads, grads.shape[0], sm.shape[1])
ret = grads - tensor.sum(grads, axis=1, keepdims=True) * sm ret = grads - tt.sum(grads, axis=1, keepdims=True) * sm
ret.tag.values_eq_approx = values_eq_approx_remove_nan ret.tag.values_eq_approx = values_eq_approx_remove_nan
copy_stack_trace(node.outputs[0], ret) copy_stack_trace(node.outputs[0], ret)
return [ret] return [ret]
def softmax_graph(c):
    """Build softmax of `c` from elementary ops.

    The result is ``exp(c)`` divided by the sum of ``exp(c)`` taken over the
    last axis; the summed axis is kept (``keepdims=True``) so the division
    broadcasts against the numerator.
    """
    numerator = tt.exp(c)
    # The exponential is deliberately rebuilt here (rather than reusing
    # `numerator`) so the constructed graph is the same as before.
    normalizer = tt.exp(c).sum(axis=-1, keepdims=True)
    return numerator / normalizer
def softmax(c): def softmax(c):
...@@ -846,8 +852,8 @@ def logsoftmax(c): ...@@ -846,8 +852,8 @@ def logsoftmax(c):
return logsoftmax_op(c) return logsoftmax_op(c)
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.local_optimizer([softmax_op]) @local_optimizer([softmax_op])
def local_softmax_with_bias(node): def local_softmax_with_bias(node):
""" """
Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias). Try to turn softmax(sum_of_stuff) -> softmax_w_bias(matrix, bias).
...@@ -855,25 +861,25 @@ def local_softmax_with_bias(node): ...@@ -855,25 +861,25 @@ def local_softmax_with_bias(node):
""" """
if node.op == softmax_op: if node.op == softmax_op:
(x,) = node.inputs (x,) = node.inputs
if x.owner and x.owner.op == tensor.add: if x.owner and x.owner.op == tt.add:
vectors = [] vectors = []
non_vectors = [] non_vectors = []
for x_in in x.owner.inputs: for x_in in x.owner.inputs:
if list(x_in.type.broadcastable) == [True, False]: if list(x_in.type.broadcastable) == [True, False]:
# print isinstance(x_in.owner.op, # print isinstance(x_in.owner.op,
# tensor.DimShuffle) since specialization comes # tt.DimShuffle) since specialization comes
# relatively late in optimization, we don't want to # relatively late in optimization, we don't want to
# put in extra DimShuffles un-necessarily. # put in extra DimShuffles un-necessarily.
if ( if (
x_in.owner x_in.owner
and isinstance(x_in.owner.op, tensor.DimShuffle) and isinstance(x_in.owner.op, tt.DimShuffle)
and list(x_in.owner.inputs[0].type.broadcastable) == [False] and list(x_in.owner.inputs[0].type.broadcastable) == [False]
): ):
# cut out the DimShuffle that was broadcasting a vector # cut out the DimShuffle that was broadcasting a vector
vectors.append(x_in.owner.inputs[0]) vectors.append(x_in.owner.inputs[0])
else: else:
# insert an extra DimShuffle to correct the old one # insert an extra DimShuffle to correct the old one
vectors.append(tensor.DimShuffle((True, False), (1,))(x_in)) vectors.append(tt.DimShuffle((True, False), (1,))(x_in))
else: else:
non_vectors.append(x_in) non_vectors.append(x_in)
...@@ -882,19 +888,19 @@ def local_softmax_with_bias(node): ...@@ -882,19 +888,19 @@ def local_softmax_with_bias(node):
if len(non_vectors) == 0: if len(non_vectors) == 0:
assert len(vectors) > 0 # we should have at least 1 input... assert len(vectors) > 0 # we should have at least 1 input...
promoted_vector = vectors.pop() promoted_vector = vectors.pop()
non_vectors.append(tensor.shape_padleft(promoted_vector)) non_vectors.append(tt.shape_padleft(promoted_vector))
assert non_vectors # not empty assert non_vectors # not empty
if vectors: if vectors:
# we're in business... # we're in business...
if len(vectors) > 1: if len(vectors) > 1:
vector_sum = tensor.add(*vectors) vector_sum = tt.add(*vectors)
copy_stack_trace(x_in, vector_sum) copy_stack_trace(x_in, vector_sum)
else: else:
vector_sum = vectors[0] vector_sum = vectors[0]
if len(non_vectors) > 1: if len(non_vectors) > 1:
non_vector_sum = tensor.add(*non_vectors) non_vector_sum = tt.add(*non_vectors)
copy_stack_trace(x_in, non_vector_sum) copy_stack_trace(x_in, non_vector_sum)
else: else:
non_vector_sum = non_vectors[0] non_vector_sum = non_vectors[0]
...@@ -921,7 +927,7 @@ def softmax_simplifier(numerators, denominators): ...@@ -921,7 +927,7 @@ def softmax_simplifier(numerators, denominators):
if numerator.ndim != 2: if numerator.ndim != 2:
continue continue
if numerator.owner and numerator.owner.op == tensor.exp: if numerator.owner and numerator.owner.op == tt.exp:
x = numerator.owner.inputs[0] x = numerator.owner.inputs[0]
else: else:
continue continue
...@@ -929,13 +935,11 @@ def softmax_simplifier(numerators, denominators): ...@@ -929,13 +935,11 @@ def softmax_simplifier(numerators, denominators):
matching_denom = None matching_denom = None
for denominator in denominators: for denominator in denominators:
if denominator.owner and isinstance( if denominator.owner and isinstance(denominator.owner.op, tt.DimShuffle):
denominator.owner.op, tensor.DimShuffle
):
if denominator.owner.op.new_order == (0, "x"): if denominator.owner.op.new_order == (0, "x"):
z = denominator.owner.inputs[0] z = denominator.owner.inputs[0]
# thing getting dimshuffled # thing getting dimshuffled
if z.owner and isinstance(z.owner.op, tensor.Sum): if z.owner and isinstance(z.owner.op, tt.Sum):
# print 'ASDF', denominator.owner.op.new_order # print 'ASDF', denominator.owner.op.new_order
# print z.owner.op.axis # print z.owner.op.axis
if z.owner.op.axis == (1,): if z.owner.op.axis == (1,):
...@@ -956,7 +960,7 @@ def softmax_simplifier(numerators, denominators): ...@@ -956,7 +960,7 @@ def softmax_simplifier(numerators, denominators):
opt.local_mul_canonizer.add_simplifier(softmax_simplifier, "softmax_simplifier") opt.local_mul_canonizer.add_simplifier(softmax_simplifier, "softmax_simplifier")
class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): class CrossentropySoftmaxArgmax1HotWithBias(Op):
""" """
A special compound L{Op} for the output of neural-net classifiers. A special compound L{Op} for the output of neural-net classifiers.
...@@ -994,21 +998,21 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -994,21 +998,21 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
__props__ = () __props__ = ()
def __init__(self, **kwargs): def __init__(self, **kwargs):
gof.Op.__init__(self, **kwargs) Op.__init__(self, **kwargs)
def make_node(self, x, b, y_idx): def make_node(self, x, b, y_idx):
x = tensor.as_tensor_variable(x) x = tt.as_tensor_variable(x)
b = tensor.as_tensor_variable(b) b = tt.as_tensor_variable(b)
y_idx = tensor.as_tensor_variable(y_idx) y_idx = tt.as_tensor_variable(y_idx)
if x.type.ndim != 2 or x.type.dtype not in tensor.float_dtypes: if x.type.ndim != 2 or x.type.dtype not in tt.float_dtypes:
raise ValueError("x must be 2-d tensor of floats", x.type) raise ValueError("x must be 2-d tensor of floats", x.type)
if b.type.ndim != 1 or x.type.dtype not in tensor.float_dtypes: if b.type.ndim != 1 or x.type.dtype not in tt.float_dtypes:
raise ValueError("b must be 1-d tensor of floats", b.type) raise ValueError("b must be 1-d tensor of floats", b.type)
if y_idx.type.ndim != 1 or y_idx.type.dtype not in tensor.discrete_dtypes: if y_idx.type.ndim != 1 or y_idx.type.dtype not in tt.discrete_dtypes:
raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type) raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type)
# TODO: Is this correct? It used to be y, not y_idx # TODO: Is this correct? It used to be y, not y_idx
nll = tensor.TensorType(x.type.dtype, y_idx.type.broadcastable).make_variable() nll = tt.TensorType(x.type.dtype, y_idx.type.broadcastable).make_variable()
# nll = TensorType(x.dtype, y.broadcastable) # nll = TensorType(x.dtype, y.broadcastable)
sm = x.type() sm = x.type()
am = y_idx.type() am = y_idx.type()
...@@ -1092,7 +1096,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -1092,7 +1096,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
if not isinstance(g_nll.type, DisconnectedType): if not isinstance(g_nll.type, DisconnectedType):
nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx) nll, sm = crossentropy_softmax_1hot_with_bias(x, b, y_idx)
dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, sm, y_idx) dx = crossentropy_softmax_1hot_with_bias_dx(g_nll, sm, y_idx)
db = tensor.sum(dx, axis=[0]) db = tt.sum(dx, axis=[0])
dx_terms.append(dx) dx_terms.append(dx)
db_terms.append(db) db_terms.append(db)
...@@ -1215,7 +1219,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op): ...@@ -1215,7 +1219,7 @@ class CrossentropySoftmaxArgmax1HotWithBias(gof.Op):
return code_template % dict(locals(), **sub) return code_template % dict(locals(), **sub)
class CrossentropySoftmax1HotWithBiasDx(gof.Op): class CrossentropySoftmax1HotWithBiasDx(Op):
""" """
Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op. Gradient wrt x of the CrossentropySoftmaxArgmax1HotWithBias Op.
...@@ -1226,14 +1230,14 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op): ...@@ -1226,14 +1230,14 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
__props__ = () __props__ = ()
def make_node(self, dy, sm, y_idx, **kwargs): def make_node(self, dy, sm, y_idx, **kwargs):
dy = tensor.as_tensor_variable(dy) dy = tt.as_tensor_variable(dy)
sm = tensor.as_tensor_variable(sm) sm = tt.as_tensor_variable(sm)
y_idx = tensor.as_tensor_variable(y_idx) y_idx = tt.as_tensor_variable(y_idx)
if dy.type.ndim > 1 or dy.type.dtype not in tensor.float_dtypes: if dy.type.ndim > 1 or dy.type.dtype not in tt.float_dtypes:
raise ValueError("dy must be {0,1}-d tensor of floats", dy.type) raise ValueError("dy must be {0,1}-d tensor of floats", dy.type)
if sm.type.ndim != 2 or sm.type.dtype not in tensor.float_dtypes: if sm.type.ndim != 2 or sm.type.dtype not in tt.float_dtypes:
raise ValueError("sm must be 2-d tensor of floats", sm.type) raise ValueError("sm must be 2-d tensor of floats", sm.type)
if y_idx.type.ndim != 1 or y_idx.type.dtype not in tensor.discrete_dtypes: if y_idx.type.ndim != 1 or y_idx.type.dtype not in tt.discrete_dtypes:
raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type) raise ValueError("y_idx must be 1-d tensor of [u]ints", y_idx.type)
return Apply(self, [dy, sm, y_idx], [sm.type()]) return Apply(self, [dy, sm, y_idx], [sm.type()])
...@@ -1261,12 +1265,10 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op): ...@@ -1261,12 +1265,10 @@ class CrossentropySoftmax1HotWithBiasDx(gof.Op):
# advanced indexing is not working yet. When it works, do it to avoid # advanced indexing is not working yet. When it works, do it to avoid
# potentially misleading behavior in gradient computations! (although # potentially misleading behavior in gradient computations! (although
# typically we should not need the gradient w.r.t. dy). # typically we should not need the gradient w.r.t. dy).
y_idx_range = tensor.arange(y_idx.shape[0]) y_idx_range = tt.arange(y_idx.shape[0])
g_dy = tensor.sum( g_dy = tt.sum(
g_dx g_dx
* subtensor.AdvancedIncSubtensor()( * subtensor.AdvancedIncSubtensor()(sm, tt.fill(dy, -1), y_idx_range, y_idx),
sm, tensor.fill(dy, -1), y_idx_range, y_idx
),
axis=1, axis=1,
) )
g_sm = dy.dimshuffle(0, "x") * g_dx g_sm = dy.dimshuffle(0, "x") * g_dx
...@@ -1394,7 +1396,7 @@ def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs): ...@@ -1394,7 +1396,7 @@ def crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs):
def crossentropy_softmax_1hot(x, y_idx, **kwargs):
    """Bias-free variant of `crossentropy_softmax_1hot_with_bias`.

    A zero bias shaped like the first row of `x` is created and passed,
    together with `x`, `y_idx` and any extra keyword arguments, to
    `crossentropy_softmax_1hot_with_bias`, whose result is returned as is.
    """
    zero_bias = tt.zeros_like(x[0, :])
    return crossentropy_softmax_1hot_with_bias(x, zero_bias, y_idx, **kwargs)
...@@ -1415,16 +1417,16 @@ def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs): ...@@ -1415,16 +1417,16 @@ def crossentropy_softmax_max_and_argmax_1hot_with_bias(x, b, y_idx, **kwargs):
""" """
(xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs) (xent, softmax) = crossentropy_softmax_1hot_with_bias(x, b, y_idx, **kwargs)
(max_pr, argmax) = tensor.max_and_argmax(softmax, axis=-1) (max_pr, argmax) = tt.max_and_argmax(softmax, axis=-1)
return (xent, softmax, max_pr, argmax) return (xent, softmax, max_pr, argmax)
def crossentropy_softmax_max_and_argmax_1hot(x, y_idx, **kwargs):
    """Bias-free variant of `crossentropy_softmax_max_and_argmax_1hot_with_bias`.

    A zero bias shaped like the first row of `x` is created and forwarded,
    along with `x`, `y_idx` and any extra keyword arguments, to
    `crossentropy_softmax_max_and_argmax_1hot_with_bias`; that function's
    result is returned unchanged.
    """
    zero_bias = tt.zeros_like(x[0, :])
    return crossentropy_softmax_max_and_argmax_1hot_with_bias(
        x, zero_bias, y_idx, **kwargs
    )
class CrossentropyCategorical1HotGrad(gof.Op): class CrossentropyCategorical1HotGrad(Op):
__props__ = () __props__ = ()
...@@ -1446,7 +1448,7 @@ class CrossentropyCategorical1HotGrad(gof.Op): ...@@ -1446,7 +1448,7 @@ class CrossentropyCategorical1HotGrad(gof.Op):
crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad() crossentropy_categorical_1hot_grad = CrossentropyCategorical1HotGrad()
class CrossentropyCategorical1Hot(gof.Op): class CrossentropyCategorical1Hot(Op):
r""" r"""
Compute the cross entropy between a coding distribution and Compute the cross entropy between a coding distribution and
a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0]. a true distribution of the form [0, 0, ... 0, 1, 0, ..., 0].
...@@ -1477,20 +1479,20 @@ class CrossentropyCategorical1Hot(gof.Op): ...@@ -1477,20 +1479,20 @@ class CrossentropyCategorical1Hot(gof.Op):
dvector dvector
""" """
_coding_dist = tensor.as_tensor_variable(coding_dist) _coding_dist = tt.as_tensor_variable(coding_dist)
_true_one_of_n = tensor.as_tensor_variable(true_one_of_n) _true_one_of_n = tt.as_tensor_variable(true_one_of_n)
if _coding_dist.type.ndim != 2: if _coding_dist.type.ndim != 2:
raise TypeError("matrix required for argument: coding_dist") raise TypeError("matrix required for argument: coding_dist")
if _true_one_of_n.type not in (tensor.lvector, tensor.ivector): if _true_one_of_n.type not in (tt.lvector, tt.ivector):
raise TypeError( raise TypeError(
"integer vector required for argument: true_one_of_n" "integer vector required for argument: true_one_of_n"
"(got type: %s instead of: %s)" % (_true_one_of_n.type, tensor.lvector) "(got type: %s instead of: %s)" % (_true_one_of_n.type, tt.lvector)
) )
return Apply( return Apply(
self, self,
[_coding_dist, _true_one_of_n], [_coding_dist, _true_one_of_n],
[tensor.Tensor(dtype=_coding_dist.dtype, broadcastable=[False])()], [tt.Tensor(dtype=_coding_dist.dtype, broadcastable=[False])()],
) )
def perform(self, node, inp, out): def perform(self, node, inp, out):
...@@ -1516,9 +1518,9 @@ class CrossentropyCategorical1Hot(gof.Op): ...@@ -1516,9 +1518,9 @@ class CrossentropyCategorical1Hot(gof.Op):
crossentropy_categorical_1hot = CrossentropyCategorical1Hot() crossentropy_categorical_1hot = CrossentropyCategorical1Hot()
@opt.register_stabilize("fast_compile_gpu") @register_stabilize("fast_compile_gpu")
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.optimizer @optimizer
def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
""" """
This is a stabilization optimization. This is a stabilization optimization.
...@@ -1555,7 +1557,7 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph): ...@@ -1555,7 +1557,7 @@ def crossentropy_to_crossentropy_with_softmax_with_bias(fgraph):
return return
@gof.optimizer @optimizer
def crossentropy_to_crossentropy_with_softmax(fgraph): def crossentropy_to_crossentropy_with_softmax(fgraph):
""" """
This is a stabilization optimization that is more general than This is a stabilization optimization that is more general than
...@@ -1585,7 +1587,7 @@ def crossentropy_to_crossentropy_with_softmax(fgraph): ...@@ -1585,7 +1587,7 @@ def crossentropy_to_crossentropy_with_softmax(fgraph):
new_sm, new_sm,
new_am, new_am,
) = crossentropy_softmax_argmax_1hot_with_bias( ) = crossentropy_softmax_argmax_1hot_with_bias(
x, tensor.zeros_like(x[0]), one_of_n x, tt.zeros_like(x[0]), one_of_n
) )
fgraph.replace_all_validate( fgraph.replace_all_validate(
[(nll, new_nll), (sm, new_sm)], [(nll, new_nll), (sm, new_sm)],
...@@ -1622,10 +1624,10 @@ optdb.register( ...@@ -1622,10 +1624,10 @@ optdb.register(
) )
@opt.register_specialize( @register_specialize(
"fast_compile_gpu", "local_crossentropy_to_crossentropy_with_softmax_grad" "fast_compile_gpu", "local_crossentropy_to_crossentropy_with_softmax_grad"
) # old name ) # old name
@gof.local_optimizer([softmax_grad]) @local_optimizer([softmax_grad])
def local_softmax_grad_to_crossentropy_with_softmax_grad(node): def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
if node.op == softmax_grad: if node.op == softmax_grad:
g_coding_dist, coding_dist = node.inputs g_coding_dist, coding_dist = node.inputs
...@@ -1641,20 +1643,20 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node): ...@@ -1641,20 +1643,20 @@ def local_softmax_grad_to_crossentropy_with_softmax_grad(node):
return [dx] return [dx]
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.local_optimizer([tensor.MaxAndArgmax]) @local_optimizer([MaxAndArgmax])
def local_argmax_pushdown(node): def local_argmax_pushdown(node):
if ( if (
isinstance(node.op, tensor.MaxAndArgmax) isinstance(node.op, MaxAndArgmax)
and node.inputs[0].owner and node.inputs[0].owner
and len(node.outputs[0].clients) > 0 and len(node.outputs[0].clients) > 0
and node.inputs[0].owner.op and node.inputs[0].owner.op
in ( in (
softmax_op, softmax_op,
softplus, softplus,
tensor.exp, tt.exp,
tensor.log, log,
tensor.tanh, tt.tanh,
sigmoid, sigmoid,
softmax_with_bias, softmax_with_bias,
) )
...@@ -1671,7 +1673,7 @@ def local_argmax_pushdown(node): ...@@ -1671,7 +1673,7 @@ def local_argmax_pushdown(node):
) )
if ( if (
isinstance(node.op, tensor.MaxAndArgmax) isinstance(node.op, MaxAndArgmax)
and node.inputs[0].owner and node.inputs[0].owner
and len(node.outputs[0].clients) == 0 and len(node.outputs[0].clients) == 0
): ):
...@@ -1682,19 +1684,19 @@ def local_argmax_pushdown(node): ...@@ -1682,19 +1684,19 @@ def local_argmax_pushdown(node):
if x.owner and x.owner.op in ( if x.owner and x.owner.op in (
softmax_op, softmax_op,
softplus, softplus,
tensor.exp, tt.exp,
tensor.log, log,
tensor.tanh, tt.tanh,
sigmoid, sigmoid,
): ):
(pre_x,) = x.owner.inputs (pre_x,) = x.owner.inputs
ret = tensor.max_and_argmax(pre_x, axis) ret = tt.max_and_argmax(pre_x, axis)
copy_stack_trace(x_max, ret) copy_stack_trace(x_max, ret)
return ret return ret
if x.owner and x.owner.op == softmax_with_bias: if x.owner and x.owner.op == softmax_with_bias:
pre_x, pre_bias = x.owner.inputs pre_x, pre_bias = x.owner.inputs
ret = tensor.max_and_argmax( ret = tt.max_and_argmax(
pre_x + tensor.DimShuffle(pre_bias.broadcastable, ("x", 0))(pre_bias), pre_x + tt.DimShuffle(pre_bias.broadcastable, ("x", 0))(pre_bias),
axis, axis,
) )
# copy both stack traces # copy both stack traces
...@@ -1706,11 +1708,11 @@ def local_argmax_pushdown(node): ...@@ -1706,11 +1708,11 @@ def local_argmax_pushdown(node):
def _check_rows_is_arange_len_labels(rows, labels): def _check_rows_is_arange_len_labels(rows, labels):
""" """Check that `rows` is the same node as `tt.arange(labels.shape[0])`.
Check that 'rows' is the same node as T.arange(labels.shape[0]).
Also considers the case where labels.shape[0] is constant and equal Also considers the case where `labels.shape[0]` is constant and equal to 1,
to 1, and T.arange(labels.shape[0]) has been constant-folded into 0. and `tt.arange(labels.shape[0])` has been constant-folded into
0.
""" """
...@@ -1724,7 +1726,7 @@ def _check_rows_is_arange_len_labels(rows, labels): ...@@ -1724,7 +1726,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
if len(shape_of[labels]) == 1 and _is_const(shape_of[labels][0], 1): if len(shape_of[labels]) == 1 and _is_const(shape_of[labels][0], 1):
return _is_const(rows, 0) return _is_const(rows, 0)
if rows.owner and isinstance(rows.owner.op, tensor.ARange): if rows.owner and isinstance(rows.owner.op, tt.ARange):
start, stop, step = rows.owner.inputs start, stop, step = rows.owner.inputs
if getattr(start, "data", None) != 0: # constants will have data if getattr(start, "data", None) != 0: # constants will have data
return False return False
...@@ -1741,7 +1743,7 @@ def _check_rows_is_arange_len_labels(rows, labels): ...@@ -1741,7 +1743,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
shape_subtensor.inputs, allow_partial=True shape_subtensor.inputs, allow_partial=True
) == [0]: ) == [0]:
shape_var = shape_subtensor.inputs[0] shape_var = shape_subtensor.inputs[0]
if shape_var.owner and shape_var.owner.op == tensor.shape: if shape_var.owner and shape_var.owner.op == tt.shape:
return shape_var.owner.inputs[0] is labels return shape_var.owner.inputs[0] is labels
else: else:
shape_of = stop.owner.fgraph.shape_feature.shape_of shape_of = stop.owner.fgraph.shape_feature.shape_of
...@@ -1751,7 +1753,7 @@ def _check_rows_is_arange_len_labels(rows, labels): ...@@ -1751,7 +1753,7 @@ def _check_rows_is_arange_len_labels(rows, labels):
def _is_const(z, val, approx=False): def _is_const(z, val, approx=False):
try: try:
maybe = opt.get_scalar_constant_value(z) maybe = opt.get_scalar_constant_value(z)
except tensor.NotScalarConstantError: except tt.NotScalarConstantError:
return False return False
if approx: if approx:
return np.allclose(maybe, val) return np.allclose(maybe, val)
...@@ -1759,24 +1761,24 @@ def _is_const(z, val, approx=False): ...@@ -1759,24 +1761,24 @@ def _is_const(z, val, approx=False):
return np.all(maybe == val) return np.all(maybe == val)
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.local_optimizer([subtensor.AdvancedSubtensor, tensor.log]) @local_optimizer([AdvancedSubtensor, log])
def local_advanced_indexing_crossentropy_onehot(node): def local_advanced_indexing_crossentropy_onehot(node):
log = None log_op = None
sm = None sm = None
# First case: log(softmax(x))[rows, labels] # First case: log(softmax(x))[rows, labels]
if isinstance(node.op, subtensor.AdvancedSubtensor): if isinstance(node.op, AdvancedSubtensor):
try: try:
log, rows, labels = node.inputs log_op, rows, labels = node.inputs
except Exception: except Exception:
pass pass
if log and log.owner and log.owner.op == tensor.log: if log_op and log_op.owner and log_op.owner.op == log:
sm = log.owner.inputs[0] sm = log_op.owner.inputs[0]
# Second case: log(softmax(x)[rows, labels]) # Second case: log(softmax(x)[rows, labels])
elif node.op == tensor.log: elif node.op == log:
pre_log = node.inputs[0].owner pre_log = node.inputs[0].owner
if pre_log and isinstance(pre_log.op, subtensor.AdvancedSubtensor): if pre_log and isinstance(pre_log.op, AdvancedSubtensor):
try: try:
sm, rows, labels = pre_log.inputs sm, rows, labels = pre_log.inputs
except Exception: except Exception:
...@@ -1789,7 +1791,7 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1789,7 +1791,7 @@ def local_advanced_indexing_crossentropy_onehot(node):
x_var, b_var = sm_w_bias[0].owner.inputs x_var, b_var = sm_w_bias[0].owner.inputs
else: else:
x_var = sm.owner.inputs[0] x_var = sm.owner.inputs[0]
b_var = tensor.zeros_like(x_var[0]) b_var = tt.zeros_like(x_var[0])
# Check that rows == arange(labels.shape[0]) # Check that rows == arange(labels.shape[0])
if _check_rows_is_arange_len_labels(rows, labels): if _check_rows_is_arange_len_labels(rows, labels):
...@@ -1802,8 +1804,8 @@ def local_advanced_indexing_crossentropy_onehot(node): ...@@ -1802,8 +1804,8 @@ def local_advanced_indexing_crossentropy_onehot(node):
return [ret] return [ret]
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.local_optimizer([softmax_grad]) @local_optimizer([softmax_grad])
def local_advanced_indexing_crossentropy_onehot_grad(node): def local_advanced_indexing_crossentropy_onehot_grad(node):
if not (node.op == softmax_grad): if not (node.op == softmax_grad):
return return
...@@ -1880,11 +1882,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1880,11 +1882,11 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# If there's a 'minus' sign before the whole expression, put it in # If there's a 'minus' sign before the whole expression, put it in
# out_grad and iterate # out_grad and iterate
if incr.owner and incr.owner.op == tensor.neg: if incr.owner and incr.owner.op == tt.neg:
out_grad = -out_grad out_grad = -out_grad
incr = incr.owner.inputs[0] incr = incr.owner.inputs[0]
if incr.owner and incr.owner.op == tensor.true_div: if incr.owner and incr.owner.op == tt.true_div:
num, denom = incr.owner.inputs num, denom = incr.owner.inputs
# set out_grad according to the numerator, it may be divided later # set out_grad according to the numerator, it may be divided later
...@@ -1897,24 +1899,22 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1897,24 +1899,22 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
if not denom.owner: if not denom.owner:
return return
if isinstance(denom.owner.op, subtensor.AdvancedSubtensor): if isinstance(denom.owner.op, AdvancedSubtensor):
# Base case # Base case
adv_subtensor = denom adv_subtensor = denom
# out_grad /= 1. # out_grad /= 1.
elif denom.owner.op == tensor.mul: elif denom.owner.op == tt.mul:
# Try to find the AdvancedSubtensor node mentioned above, # Try to find the AdvancedSubtensor node mentioned above,
# and the output gradient # and the output gradient
for i, input in enumerate(denom.owner.inputs): for i, input in enumerate(denom.owner.inputs):
if input.owner and isinstance( if input.owner and isinstance(input.owner.op, AdvancedSubtensor):
input.owner.op, subtensor.AdvancedSubtensor
):
other_inputs = [ other_inputs = [
in_ for (j, in_) in enumerate(denom.owner.inputs) if j != i in_ for (j, in_) in enumerate(denom.owner.inputs) if j != i
] ]
if len(other_inputs) == 1: if len(other_inputs) == 1:
rest = other_inputs[0] rest = other_inputs[0]
else: else:
rest = tensor.mul(*[other_inputs]) rest = tt.mul(*[other_inputs])
# Check that rest is a vector or a scalar # Check that rest is a vector or a scalar
if rest.ndim == 1 or np.all(rest.broadcastable): if rest.ndim == 1 or np.all(rest.broadcastable):
...@@ -1925,7 +1925,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1925,7 +1925,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
return return
# The output gradient needs to be a vector # The output gradient needs to be a vector
out_grad = tensor.fill(x_var[:, 0], out_grad) out_grad = tt.fill(x_var[:, 0], out_grad)
if adv_subtensor is not None: if adv_subtensor is not None:
try: try:
...@@ -1950,7 +1950,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1950,7 +1950,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# it was really case 1. # it was really case 1.
# Second case # Second case
elif d_sm.owner and d_sm.owner.op == tensor.true_div: elif d_sm.owner and d_sm.owner.op == tt.true_div:
# we're looking for # we're looking for
# AdvIncSubtensor(zeros, grad_nll, arange(len(y)), y) / softmax # AdvIncSubtensor(zeros, grad_nll, arange(len(y)), y) / softmax
try: try:
...@@ -1979,7 +1979,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -1979,7 +1979,7 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
# if the graph is valid, they have the same shape, so we # if the graph is valid, they have the same shape, so we
# also know that z has the right shape. # also know that z has the right shape.
if incr.ndim != 1 or incr.dtype not in tensor.float_dtypes: if incr.ndim != 1 or incr.dtype not in tt.float_dtypes:
return return
# here we know that we are incrementing some part of # here we know that we are incrementing some part of
...@@ -2018,8 +2018,8 @@ def local_advanced_indexing_crossentropy_onehot_grad(node): ...@@ -2018,8 +2018,8 @@ def local_advanced_indexing_crossentropy_onehot_grad(node):
return return
@opt.register_specialize("fast_compile_gpu") @register_specialize("fast_compile_gpu")
@gof.local_optimizer([softmax_with_bias]) @local_optimizer([softmax_with_bias])
def graph_merge_softmax_with_crossentropy_softmax(node): def graph_merge_softmax_with_crossentropy_softmax(node):
if node.op == softmax_with_bias: if node.op == softmax_with_bias:
x, b = node.inputs x, b = node.inputs
...@@ -2033,10 +2033,10 @@ def graph_merge_softmax_with_crossentropy_softmax(node): ...@@ -2033,10 +2033,10 @@ def graph_merge_softmax_with_crossentropy_softmax(node):
return [mergeable_client[1]] return [mergeable_client[1]]
@opt.register_specialize @register_specialize
@opt.register_stabilize @register_stabilize
@opt.register_canonicalize @register_canonicalize
@gof.local_optimizer([CrossentropySoftmax1HotWithBiasDx]) @local_optimizer([CrossentropySoftmax1HotWithBiasDx])
def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
""" """
Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is Replace a CrossentropySoftmax1HotWithBiasDx op, whose incoming gradient is
...@@ -2057,7 +2057,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): ...@@ -2057,7 +2057,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
assert dy.ndim == 1 assert dy.ndim == 1
if dy.owner is not None and isinstance(dy.owner.op, tensor.Alloc): if dy.owner is not None and isinstance(dy.owner.op, tt.Alloc):
# dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>) # dz is the input of the Alloc op, i.e. T.alloc(dz, <shape>)
dz = dy.owner.inputs[0] dz = dy.owner.inputs[0]
...@@ -2087,9 +2087,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node): ...@@ -2087,9 +2087,7 @@ def local_useless_crossentropy_softmax_1hot_with_bias_dx_alloc(node):
# If `dz` is broadcastable, we need to check whether the shapes # If `dz` is broadcastable, we need to check whether the shapes
# of `dy` and `sm` are the same or whether the shape of `dy` is # of `dy` and `sm` are the same or whether the shape of `dy` is
# equal to 1. # equal to 1.
cond = tensor.or_( cond = tt.or_(tt.eq(dy.shape[0], 1), tt.eq(dy.shape[0], sm.shape[0]))
tensor.eq(dy.shape[0], 1), tensor.eq(dy.shape[0], sm.shape[0])
)
msg = "`sm` and `dy` do not have the same shape." msg = "`sm` and `dy` do not have the same shape."
dz = opt.Assert(msg)(dz, cond) dz = opt.Assert(msg)(dz, cond)
...@@ -2115,7 +2113,7 @@ def binary_crossentropy(output, target): ...@@ -2115,7 +2113,7 @@ def binary_crossentropy(output, target):
TODO : Rewrite as a scalar, and then broadcast to tensor. TODO : Rewrite as a scalar, and then broadcast to tensor.
""" """
return -(target * tensor.log(output) + (1.0 - target) * tensor.log(1.0 - output)) return -(target * log(output) + (1.0 - target) * log(1.0 - output))
def sigmoid_binary_crossentropy(output, target): def sigmoid_binary_crossentropy(output, target):
...@@ -2193,16 +2191,14 @@ def categorical_crossentropy(coding_dist, true_dist): ...@@ -2193,16 +2191,14 @@ def categorical_crossentropy(coding_dist, true_dist):
""" """
if true_dist.ndim == coding_dist.ndim: if true_dist.ndim == coding_dist.ndim:
return -tensor.sum( return -tt.sum(true_dist * log(coding_dist), axis=coding_dist.ndim - 1)
true_dist * tensor.log(coding_dist), axis=coding_dist.ndim - 1
)
elif true_dist.ndim == coding_dist.ndim - 1: elif true_dist.ndim == coding_dist.ndim - 1:
return crossentropy_categorical_1hot(coding_dist, true_dist) return crossentropy_categorical_1hot(coding_dist, true_dist)
else: else:
raise TypeError("rank mismatch between coding and true distributions") raise TypeError("rank mismatch between coding and true distributions")
class Prepend_scalar_constant_to_each_row(gof.Op): class Prepend_scalar_constant_to_each_row(Op):
__props__ = () __props__ = ()
...@@ -2216,10 +2212,10 @@ class Prepend_scalar_constant_to_each_row(gof.Op): ...@@ -2216,10 +2212,10 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
def make_node(self, mat): def make_node(self, mat):
# check type of input # check type of input
x = tensor.as_tensor_variable(mat) x = tt.as_tensor_variable(mat)
if not mat.type.broadcastable == (False, False): if not mat.type.broadcastable == (False, False):
raise TypeError("Expected a matrix as input") raise TypeError("Expected a matrix as input")
y = tensor.as_tensor_variable(self.val) y = tt.as_tensor_variable(self.val)
assert y.ndim == 0 assert y.ndim == 0
if x.type.dtype != y.type.dtype: if x.type.dtype != y.type.dtype:
TypeError("the value to prepend don't have the same type as the matrix") TypeError("the value to prepend don't have the same type as the matrix")
...@@ -2255,18 +2251,18 @@ class Prepend_scalar_constant_to_each_row(gof.Op): ...@@ -2255,18 +2251,18 @@ class Prepend_scalar_constant_to_each_row(gof.Op):
return goutput[:, 1:] return goutput[:, 1:]
class Prepend_scalar_to_each_row(gof.Op): class Prepend_scalar_to_each_row(Op):
__props__ = () __props__ = ()
def make_node(self, val, mat): def make_node(self, val, mat):
# check type of input # check type of input
x = tensor.as_tensor_variable(mat) x = tt.as_tensor_variable(mat)
if isinstance(val, float): if isinstance(val, float):
val = scalar.constant(val) val = scalar.constant(val)
if not mat.type.broadcastable == (False, False): if not mat.type.broadcastable == (False, False):
raise TypeError("Expected a matrix as input") raise TypeError("Expected a matrix as input")
y = tensor.as_tensor_variable(val) y = tt.as_tensor_variable(val)
assert y.ndim == 0 assert y.ndim == 0
if x.type.dtype != y.type.dtype: if x.type.dtype != y.type.dtype:
TypeError("the value to prepend don't have the same type as the matrix") TypeError("the value to prepend don't have the same type as the matrix")
...@@ -2345,7 +2341,7 @@ def relu(x, alpha=0): ...@@ -2345,7 +2341,7 @@ def relu(x, alpha=0):
# We can't use 0.5 and 1 for one and half. as if alpha is a # We can't use 0.5 and 1 for one and half. as if alpha is a
# numpy dtype, they will be considered as float64, so would # numpy dtype, they will be considered as float64, so would
# cause upcast to float64. # cause upcast to float64.
alpha = tensor.as_tensor_variable(alpha) alpha = tt.as_tensor_variable(alpha)
f1 = 0.5 * (1 + alpha) f1 = 0.5 * (1 + alpha)
f2 = 0.5 * (1 - alpha) f2 = 0.5 * (1 - alpha)
return f1 * x + f2 * abs(x) return f1 * x + f2 * abs(x)
...@@ -2446,7 +2442,7 @@ def h_softmax( ...@@ -2446,7 +2442,7 @@ def h_softmax(
>>> import numpy as np >>> import numpy as np
>>> import theano >>> import theano
>>> from theano import tensor >>> import theano.tensor as tt
>>> from theano.tensor.nnet import h_softmax >>> from theano.tensor.nnet import h_softmax
>>> >>>
>>> # Parameters >>> # Parameters
...@@ -2472,15 +2468,15 @@ def h_softmax( ...@@ -2472,15 +2468,15 @@ def h_softmax(
>>> # We can now build the graph to compute a loss function, typically the >>> # We can now build the graph to compute a loss function, typically the
>>> # negative log-likelihood: >>> # negative log-likelihood:
>>> >>>
>>> x = tensor.imatrix('x') >>> x = tt.imatrix('x')
>>> target = tensor.imatrix('target') >>> target = tt.imatrix('target')
>>> >>>
>>> # This only computes the output corresponding to the target. >>> # This only computes the output corresponding to the target.
>>> # The complexity is O(n_classes + n_outputs_per_class). >>> # The complexity is O(n_classes + n_outputs_per_class).
>>> y_hat_tg = h_softmax(x, batch_size, output_size, n_classes, >>> y_hat_tg = h_softmax(x, batch_size, output_size, n_classes,
... n_outputs_per_class, W1, b1, W2, b2, target) ... n_outputs_per_class, W1, b1, W2, b2, target)
>>> >>>
>>> negll = -tensor.mean(tensor.log(y_hat_tg)) >>> negll = -tt.mean(tt.log(y_hat_tg))
>>> >>>
>>> # We may need to compute all the outputs (at test time usually): >>> # We may need to compute all the outputs (at test time usually):
>>> >>>
...@@ -2497,15 +2493,13 @@ def h_softmax( ...@@ -2497,15 +2493,13 @@ def h_softmax(
""" """
# First softmax that computes the probabilities of belonging to each class # First softmax that computes the probabilities of belonging to each class
class_probs = theano.tensor.nnet.softmax(tensor.dot(x, W1) + b1) class_probs = softmax(tt.dot(x, W1) + b1)
if target is None: # Computes the probabilities of all the outputs if target is None: # Computes the probabilities of all the outputs
# Second softmax that computes the output probabilities # Second softmax that computes the output probabilities
activations = tensor.tensordot(x, W2, (1, 1)) + b2 activations = tt.tensordot(x, W2, (1, 1)) + b2
output_probs = theano.tensor.nnet.softmax( output_probs = softmax(activations.reshape((-1, n_outputs_per_class)))
activations.reshape((-1, n_outputs_per_class))
)
output_probs = output_probs.reshape((batch_size, n_classes, -1)) output_probs = output_probs.reshape((batch_size, n_classes, -1))
output_probs = class_probs.dimshuffle(0, 1, "x") * output_probs output_probs = class_probs.dimshuffle(0, 1, "x") * output_probs
output_probs = output_probs.reshape((batch_size, -1)) output_probs = output_probs.reshape((batch_size, -1))
...@@ -2528,14 +2522,14 @@ def h_softmax( ...@@ -2528,14 +2522,14 @@ def h_softmax(
activations = sparse_block_dot( activations = sparse_block_dot(
W2.dimshuffle("x", 0, 1, 2), W2.dimshuffle("x", 0, 1, 2),
x.dimshuffle(0, "x", 1), x.dimshuffle(0, "x", 1),
tensor.zeros((batch_size, 1), dtype="int32"), tt.zeros((batch_size, 1), dtype="int32"),
b2, b2,
target_classes.dimshuffle(0, "x"), target_classes.dimshuffle(0, "x"),
) )
output_probs = theano.tensor.nnet.softmax(activations.dimshuffle(0, 2)) output_probs = softmax(activations.dimshuffle(0, 2))
target_class_probs = class_probs[tensor.arange(batch_size), target_classes] target_class_probs = class_probs[tt.arange(batch_size), target_classes]
output_probs = output_probs[tensor.arange(batch_size), target_outputs_in_class] output_probs = output_probs[tt.arange(batch_size), target_outputs_in_class]
output_probs = target_class_probs * output_probs output_probs = target_class_probs * output_probs
return output_probs return output_probs
...@@ -2565,7 +2559,7 @@ def elu(x, alpha=1): ...@@ -2565,7 +2559,7 @@ def elu(x, alpha=1):
"Fast and Accurate Deep Network Learning by "Fast and Accurate Deep Network Learning by
Exponential Linear Units (ELUs)" <http://arxiv.org/abs/1511.07289>`. Exponential Linear Units (ELUs)" <http://arxiv.org/abs/1511.07289>`.
""" """
return tensor.switch(x > 0, x, alpha * tensor.expm1(x)) return tt.switch(x > 0, x, alpha * tt.expm1(x))
def selu(x): def selu(x):
...@@ -2593,7 +2587,7 @@ def selu(x): ...@@ -2593,7 +2587,7 @@ def selu(x):
return scale * elu(x, alpha) return scale * elu(x, alpha)
class ScalarSoftsign(theano.scalar.UnaryScalarOp): class ScalarSoftsign(UnaryScalarOp):
""" """
Softsign activation function Softsign activation function
:math:`\\varphi(\\mathbf{x}) = \\frac{1}{1+|x|}` :math:`\\varphi(\\mathbf{x}) = \\frac{1}{1+|x|}`
...@@ -2625,7 +2619,7 @@ class ScalarSoftsign(theano.scalar.UnaryScalarOp): ...@@ -2625,7 +2619,7 @@ class ScalarSoftsign(theano.scalar.UnaryScalarOp):
scalar_softsign = ScalarSoftsign(theano.scalar.upgrade_to_float, name="scalar_softsign") scalar_softsign = ScalarSoftsign(theano.scalar.upgrade_to_float, name="scalar_softsign")
softsign = elemwise.Elemwise(scalar_softsign, name="softsign") softsign = Elemwise(scalar_softsign, name="softsign")
def confusion_matrix(actual, pred): def confusion_matrix(actual, pred):
...@@ -2652,10 +2646,11 @@ def confusion_matrix(actual, pred): ...@@ -2652,10 +2646,11 @@ def confusion_matrix(actual, pred):
Examples Examples
-------- --------
>>> import theano >>> import theano
>>> import theano.tensor as tt
>>> from theano.tensor.nnet import confusion_matrix >>> from theano.tensor.nnet import confusion_matrix
>>> x = theano.tensor.vector() >>> x = tt.vector()
>>> y = theano.tensor.vector() >>> y = tt.vector()
>>> f = theano.function([x, y], confusion_matrix(x, y)) >>> f = theano.function([x, y], confusion_matrix(x, y))
>>> y_true = [2, 0, 2, 2, 0, 1] >>> y_true = [2, 0, 2, 2, 0, 1]
>>> y_pred = [0, 0, 2, 2, 0, 2] >>> y_pred = [0, 0, 2, 2, 0, 2]
...@@ -2669,13 +2664,13 @@ def confusion_matrix(actual, pred): ...@@ -2669,13 +2664,13 @@ def confusion_matrix(actual, pred):
if pred.ndim != 1: if pred.ndim != 1:
raise ValueError("pred must be 1-d tensor variable") raise ValueError("pred must be 1-d tensor variable")
order = extra_ops.Unique(False, False, False)(tensor.concatenate([actual, pred])) order = extra_ops.Unique(False, False, False)(tt.concatenate([actual, pred]))
colA = actual.dimshuffle(0, "x") colA = actual.dimshuffle(0, "x")
colP = pred.dimshuffle(0, "x") colP = pred.dimshuffle(0, "x")
oneHotA = tensor.eq(colA, order).astype("int64") oneHotA = tt.eq(colA, order).astype("int64")
oneHotP = tensor.eq(colP, order).astype("int64") oneHotP = tt.eq(colP, order).astype("int64")
conf_mat = tensor.dot(oneHotA.T, oneHotP) conf_mat = tt.dot(oneHotA.T, oneHotP)
return [conf_mat, order] return [conf_mat, order]
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论