""" dogleg algorithm with rectangular trust regions for least-squares minimization. The description of the algorithm can be found in [Voglis]_. The algorithm does trust-region iterations, but the shape of trust regions is rectangular as opposed to conventional elliptical. The intersection of a trust region and an initial feasible region is again some rectangle. Thus on each iteration a bound-constrained quadratic optimization problem is solved. A quadratic problem is solved by well-known dogleg approach, where the function is minimized along piecewise-linear "dogleg" path [NumOpt]_, Chapter 4. If Jacobian is not rank-deficient then the function is decreasing along this path, and optimization amounts to simply following along this path as long as a point stays within the bounds. A constrained Cauchy step (along the anti-gradient) is considered for safety in rank deficient cases, in this situations the convergence might be slow. If during iterations some variable hit the initial bound and the component of anti-gradient points outside the feasible region, then a next dogleg step won't make any progress. At this state such variables satisfy first-order optimality conditions and they are excluded before computing a next dogleg step. Gauss-Newton step can be computed exactly by `numpy.linalg.lstsq` (for dense Jacobian matrices) or by iterative procedure `scipy.sparse.linalg.lsmr` (for dense and sparse matrices, or Jacobian being LinearOperator). The second option allows to solve very large problems (up to couple of millions of residuals on a regular PC), provided the Jacobian matrix is sufficiently sparse. But note that dogbox is not very good for solving problems with large number of constraints, because of variables exclusion-inclusion on each iteration (a required number of function evaluations might be high or accuracy of a solution will be poor), thus its large-scale usage is probably limited to unconstrained problems. References ---------- .. [Voglis] C. Voglis and I. E. Lagaris, "A Rectangular Trust Region Dogleg Approach for Unconstrained and Bound Constrained Nonlinear Optimization", WSEAS International Conference on Applied Mathematics, Corfu, Greece, 2004. .. [NumOpt] J. Nocedal and S. J. Wright, "Numerical optimization, 2nd edition". """ from __future__ import division, print_function, absolute_import import numpy as np from numpy.linalg import lstsq, norm from scipy.sparse.linalg import LinearOperator, aslinearoperator, lsmr from scipy.optimize import OptimizeResult from scipy._lib.six import string_types from .common import ( step_size_to_bound, in_bounds, update_tr_radius, evaluate_quadratic, build_quadratic_1d, minimize_quadratic_1d, compute_grad, compute_jac_scale, check_termination, scale_for_robust_loss_function, print_header_nonlinear, print_iteration_nonlinear) def lsmr_operator(Jop, d, active_set): """Compute LinearOperator to use in LSMR by dogbox algorithm. `active_set` mask is used to excluded active variables from computations of matrix-vector products. """ m, n = Jop.shape def matvec(x): x_free = x.ravel().copy() x_free[active_set] = 0 return Jop.matvec(x * d) def rmatvec(x): r = d * Jop.rmatvec(x) r[active_set] = 0 return r return LinearOperator((m, n), matvec=matvec, rmatvec=rmatvec, dtype=float) def find_intersection(x, tr_bounds, lb, ub): """Find intersection of trust-region bounds and initial bounds. Returns ------- lb_total, ub_total : ndarray with shape of x Lower and upper bounds of the intersection region. 


def find_intersection(x, tr_bounds, lb, ub):
    """Find intersection of trust-region bounds and initial bounds.

    Returns
    -------
    lb_total, ub_total : ndarray with shape of x
        Lower and upper bounds of the intersection region.
    orig_l, orig_u : ndarray of bool with shape of x
        True means that an original bound is taken as a corresponding bound
        in the intersection region.
    tr_l, tr_u : ndarray of bool with shape of x
        True means that a trust-region bound is taken as a corresponding
        bound in the intersection region.
    """
    lb_centered = lb - x
    ub_centered = ub - x

    lb_total = np.maximum(lb_centered, -tr_bounds)
    ub_total = np.minimum(ub_centered, tr_bounds)

    orig_l = np.equal(lb_total, lb_centered)
    orig_u = np.equal(ub_total, ub_centered)

    tr_l = np.equal(lb_total, -tr_bounds)
    tr_u = np.equal(ub_total, tr_bounds)

    return lb_total, ub_total, orig_l, orig_u, tr_l, tr_u


def dogleg_step(x, newton_step, g, a, b, tr_bounds, lb, ub):
    """Find dogleg step in a rectangular region.

    Returns
    -------
    step : ndarray, shape (n,)
        Computed dogleg step.
    bound_hits : ndarray of int, shape (n,)
        Each component shows whether a corresponding variable hits the
        initial bound after the step is taken:

            *  0 - a variable doesn't hit the bound.
            * -1 - lower bound is hit.
            *  1 - upper bound is hit.
    tr_hit : bool
        Whether the step hit the boundary of the trust region.
    """
    lb_total, ub_total, orig_l, orig_u, tr_l, tr_u = find_intersection(
        x, tr_bounds, lb, ub
    )
    bound_hits = np.zeros_like(x, dtype=int)

    if in_bounds(newton_step, lb_total, ub_total):
        return newton_step, bound_hits, False

    to_bounds, _ = step_size_to_bound(np.zeros_like(x), -g, lb_total, ub_total)

    # The classical dogleg algorithm would check if the Cauchy step fits into
    # the bounds, and just return its constrained version if not. But in a
    # rectangular trust region it makes sense to try to improve the
    # constrained Cauchy step too. Thus we don't distinguish these two cases.

    cauchy_step = -minimize_quadratic_1d(a, b, 0, to_bounds)[0] * g

    step_diff = newton_step - cauchy_step
    step_size, hits = step_size_to_bound(cauchy_step, step_diff,
                                         lb_total, ub_total)
    bound_hits[(hits < 0) & orig_l] = -1
    bound_hits[(hits > 0) & orig_u] = 1
    tr_hit = np.any((hits < 0) & tr_l | (hits > 0) & tr_u)

    return cauchy_step + step_size * step_diff, bound_hits, tr_hit


def dogbox(fun, jac, x0, f0, J0, lb, ub, ftol, xtol, gtol, max_nfev, x_scale,
           loss_function, tr_solver, tr_options, verbose):
    f = f0
    f_true = f.copy()
    nfev = 1

    J = J0
    njev = 1

    if loss_function is not None:
        rho = loss_function(f)
        cost = 0.5 * np.sum(rho[0])
        J, f = scale_for_robust_loss_function(J, f, rho)
    else:
        cost = 0.5 * np.dot(f, f)

    g = compute_grad(J, f)

    jac_scale = isinstance(x_scale, string_types) and x_scale == 'jac'
    if jac_scale:
        scale, scale_inv = compute_jac_scale(J)
    else:
        scale, scale_inv = x_scale, 1 / x_scale

    Delta = norm(x0 * scale_inv, ord=np.inf)
    if Delta == 0:
        Delta = 1.0

    on_bound = np.zeros_like(x0, dtype=int)
    on_bound[np.equal(x0, lb)] = -1
    on_bound[np.equal(x0, ub)] = 1

    x = x0
    step = np.empty_like(x0)

    if max_nfev is None:
        max_nfev = x0.size * 100

    termination_status = None
    iteration = 0
    step_norm = None
    actual_reduction = None

    if verbose == 2:
        print_header_nonlinear()

    while True:
        active_set = on_bound * g < 0
        free_set = ~active_set

        g_free = g[free_set]
        g_full = g.copy()
        g[active_set] = 0
        g_norm = norm(g, ord=np.inf)
        if g_norm < gtol:
            termination_status = 1

        if verbose == 2:
            print_iteration_nonlinear(iteration, nfev, cost, actual_reduction,
                                      step_norm, g_norm)

        if termination_status is not None or nfev == max_nfev:
            break

        x_free = x[free_set]
        lb_free = lb[free_set]
        ub_free = ub[free_set]
        scale_free = scale[free_set]

        # Compute (Gauss-)Newton step and build quadratic model for Cauchy
        # step.
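        # In both branches below, the model along the anti-gradient
        # direction d = -g is the 1-D quadratic phi(t) = 0.5*a*t**2 + b*t,
        # with a = ||J d||**2 and b = g^T d = -||g||**2, restricted to the
        # free variables; dogleg_step minimizes it over the feasible
        # interval to obtain the constrained Cauchy step.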
        if tr_solver == 'exact':
            J_free = J[:, free_set]
            newton_step = lstsq(J_free, -f)[0]

            # Coefficients of the quadratic model along the anti-gradient.
            a, b = build_quadratic_1d(J_free, g_free, -g_free)
        elif tr_solver == 'lsmr':
            Jop = aslinearoperator(J)

            # We compute the lsmr step in scaled variables and then transform
            # back to normal variables. If lsmr gave the exact lsq solution,
            # this would be equivalent to not doing any transformation, but
            # from experience it's better this way.

            # We pass active_set to make computations as if we selected the
            # free subset of J columns, but without actually doing any
            # slicing, which is expensive for sparse matrices and impossible
            # for a LinearOperator.

            lsmr_op = lsmr_operator(Jop, scale, active_set)
            newton_step = -lsmr(lsmr_op, f, **tr_options)[0][free_set]
            newton_step *= scale_free

            # Components of g for active variables were zeroed, so this call
            # is correct and equivalent to using J_free and g_free.
            a, b = build_quadratic_1d(Jop, g, -g)

        actual_reduction = -1.0
        while actual_reduction <= 0 and nfev < max_nfev:
            tr_bounds = Delta * scale_free

            step_free, on_bound_free, tr_hit = dogleg_step(
                x_free, newton_step, g_free, a, b, tr_bounds, lb_free, ub_free)

            step.fill(0.0)
            step[free_set] = step_free

            if tr_solver == 'exact':
                predicted_reduction = -evaluate_quadratic(J_free, g_free,
                                                          step_free)
            elif tr_solver == 'lsmr':
                predicted_reduction = -evaluate_quadratic(Jop, g, step)

            x_new = x + step
            f_new = fun(x_new)
            nfev += 1

            step_h_norm = norm(step * scale_inv, ord=np.inf)

            if not np.all(np.isfinite(f_new)):
                Delta = 0.25 * step_h_norm
                continue

            # Usual trust-region step quality estimation.
            if loss_function is not None:
                cost_new = loss_function(f_new, cost_only=True)
            else:
                cost_new = 0.5 * np.dot(f_new, f_new)
            actual_reduction = cost - cost_new

            Delta, ratio = update_tr_radius(
                Delta, actual_reduction, predicted_reduction,
                step_h_norm, tr_hit
            )

            step_norm = norm(step)
            termination_status = check_termination(
                actual_reduction, cost, step_norm, norm(x), ratio, ftol, xtol)

            if termination_status is not None:
                break

        if actual_reduction > 0:
            on_bound[free_set] = on_bound_free

            x = x_new
            # Set variables exactly at the boundary.
            mask = on_bound == -1
            x[mask] = lb[mask]
            mask = on_bound == 1
            x[mask] = ub[mask]

            f = f_new
            f_true = f.copy()

            cost = cost_new

            J = jac(x, f)
            njev += 1

            if loss_function is not None:
                rho = loss_function(f)
                J, f = scale_for_robust_loss_function(J, f, rho)

            g = compute_grad(J, f)

            if jac_scale:
                scale, scale_inv = compute_jac_scale(J, scale_inv)
        else:
            step_norm = 0
            actual_reduction = 0

        iteration += 1

    if termination_status is None:
        termination_status = 0

    return OptimizeResult(
        x=x, cost=cost, fun=f_true, jac=J, grad=g_full, optimality=g_norm,
        active_mask=on_bound, nfev=nfev, njev=njev,
        status=termination_status)
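

# Illustrative usage sketch, not part of the solver itself: `dogbox` is
# normally reached through the public ``scipy.optimize.least_squares``
# wrapper with ``method='dogbox'``. The residual model and bounds below are
# arbitrary and only serve as a demonstration.
if __name__ == '__main__':
    from scipy.optimize import least_squares

    def example_residuals(p):
        # Residuals of fitting y = p[0] * exp(p[1] * t) to synthetic data.
        t = np.linspace(0, 1, 20)
        y = 2.0 * np.exp(-1.3 * t)
        return p[0] * np.exp(p[1] * t) - y

    result = least_squares(example_residuals, x0=[1.0, -1.0],
                           bounds=([0.0, -10.0], [10.0, 0.0]),
                           method='dogbox')
    print(result.x, result.status)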