Source code for ivy.chars.hrm

#!/usr/bin/env python
from __future__ import absolute_import, division, print_function, unicode_literals

import ivy
import numpy as np
import math
import types
from ivy.chars.expokit import cyexpokit
import scipy
from scipy import special
from scipy.optimize import minimize
from scipy.special import binom
import itertools
import random
import multiprocessing as mp
from functools import partial
import pickle

from ivy.chars.mk import *
from ivy.chars.mk_mr import *
from ivy.chars.hrm import *
from scipy import cluster
import nlopt
np.seterr(invalid="warn")
try:
    StringTypes = types.StringTypes # Python 2
except AttributeError: # Python 3
    StringTypes = [str]
"""
Functions for fitting an HRM model
"""

[docs]def hrm_mk(tree, chars, Q, nregime, pi="Equal",returnPi=False, ar=None): """ Return log-likelihood of hidden-rates-markov mk model as described in Beaulieu et al. 2013 Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence Q (np.array): Instantaneous rate matrix p (np.array): Optional pre-allocated p matrix pi (str or np.array): Option to weight the root node by given values. Either a string containing the method or an array of weights. Weights should be given in order. Accepted methods of weighting root: Equal: flat prior Equilibrium: Prior equal to stationary distribution of Q matrix Fitzjohn: Root states weighted by how well they explain the data at the tips. returnPi (bool): Whether or not to return the final values of root node weighting ar (dict): Dict of pre-allocated arrays to improve speed by avoiding creating and destroying new arrays Returns: float: Log-likelihood of model Examples: from ivy.examples import primates,primate_data Q = np.array([[-0.11,0.1 ,0.01 ,0 ], [0.1 ,-0.11,0.0 ,0.01 ], [0.01 ,0.0 ,-0.51,0.5 ], [0.0 ,0.01 ,0.5 ,-0.51]]) hrm_mk(primates,primate_data,Q,nregime=2) """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nchar = Q.shape[0] nobschar = nchar/nregime if ar is None: # Creating arrays to be used later ar = create_hrm_ar(tree, chars, nregime) # Calculating the likelihoods for each node in post-order sequence cyexpokit.dexpm_tree_preallocated_p_log(Q, ar["t"], ar["p"]) cyexpokit.cy_mk_log(ar["nodelist"], ar["p"], nchar, ar["tmp_ar"], ar["intnode_list"],ar["child_ar"]) # The last row of nodelist contains the likelihood values at the root # Applying the correct root prior if not type(pi) in StringTypes: assert len(pi) == nchar, "length of given pi does not match Q dimensions" assert str(type(pi)) == "<type 'numpy.ndarray'>", "pi must be str or numpy array" assert np.isclose(sum(pi), 1), "values of given pi must sum to 1" np.copyto(ar["root_priors"], pi) rootliks = [ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ] elif pi == "Equal": ar["root_priors"].fill(1.0/nchar) rootliks = [ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1])] elif pi == "Fitzjohn": np.copyto(ar["root_priors"], [ar["nodelist"][-1,:-1][charstate]- scipy.misc.logsumexp(ar["nodelist"][-1,:-1]) for charstate in range(nchar) ]) rootliks = [ ar["nodelist"][-1,:-1][charstate] + ar["root_priors"][charstate] for charstate in range(nchar) ] elif pi == "Equilibrium": # Equilibrium pi from the stationary distribution of Q np.copyto(ar["root_priors"],qsd(Q)) rootliks = [ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ] else: raise ValueError("invalid value for pi: {}".format(pi)) logli = scipy.misc.logsumexp(rootliks) if returnPi: return (logli, {k:v for k,v in enumerate(ar["root_priors"])}) else: return logli
def _random_Q_matrix(nobschar, nregime): """ Generate a random Q matrix with nchar*nregime rows and cols """ split = 2.0/nregime bins = [ split*r for r in range(nregime) ] Q = np.zeros([nobschar*nregime, nobschar*nregime]) for rC in range(nregime): for charC in range(nobschar): for rR in range(nregime): for charR in range(nobschar): if not ((rR == rC) and (charR == charC)): if ((rR == rC) or ((charR == charC)) and (rR+1 == rC or rR-1 == rC)): Q[charR+rR*nobschar, charC+rC*nobschar] = random.uniform(bins[rR], bins[rR]+split) Q[np.diag_indices(nobschar*nregime)] = np.sum(Q, axis=1)*-1 return Q
[docs]def fill_Q_matrix(nobschar, nregime, wrparams, brparams, Qtype="ARD", out=None, orderedRegimes = True): """ Fill a Q matrix with nchar*nregime rows and cols with values from Qparams Args: nchar (int): number of observed characters nregime (int): number of hidden rates per character wrparams (list): List of unique Q values for within-regime transitions, in order as they appear in columnwise iteration brparams (list): list of unique Q values for between-regime transition, in order as they appear in columnwise iteration Qtype (str): Either "ARD" or "Simple". What type of Q matrix to fill. out (np.array): Optional numpy array to put output into orderedRegimes (bool): Whether or not regimes are ordered such that a transition from 1 -> 3 is not possible (a branch in state 1 must pass through 2 to get to 3) Returns: array: Q-matrix with values filled in. Check to make sure values have been filled in properly """ wrparams = list(wrparams) brparams = list(brparams) if out is None: Q = np.zeros([nobschar*nregime, nobschar*nregime]) else: Q = out # TODO: DRY this (and fill_model_Q) assert Qtype in ["ARD", "Simple"] if orderedRegimes: grid = np.zeros([(nobschar*nregime)**2, 4], dtype=int) grid[:,0] = np.tile(np.repeat(list(range(nregime)), nobschar), nobschar*nregime) grid[:,1] = np.repeat(list(range(nregime)), nregime*nobschar**2) grid[:,2] = np.tile(list(range(nobschar)), nregime**2*nobschar) grid[:,3] = np.tile(np.repeat(list(range(nobschar)), nregime*nobschar), nregime) if Qtype == "ARD": wrcount = 0 brcount = 0 for i, qcell in enumerate(np.nditer(Q, order="F", op_flags=["readwrite"])): cell = grid[i] if (cell[0] == cell[1]) and cell[2] != cell[3]: qcell[...] = wrparams[wrcount] wrcount += 1 elif(cell[0] in [cell[1]+1, cell[1]-1] and cell[2] == cell[3] ): qcell[...] = brparams[brcount] brcount += 1 elif Qtype == "Simple": for i,qcell in enumerate(np.nditer(Q, order="F", op_flags=["readwrite"])): cell = grid[i] if (cell[0] == cell[1]) and cell[2] != cell[3]: qcell[...] = wrparams[cell[0]] elif(cell[0] in [cell[1]+1, cell[1]-1] and cell[2] == cell[3] ): qcell[...] = brparams[0] else: n_wr = nobschar**2-nobschar if Qtype == "ARD": # Within-regime for i,wr in enumerate([wrparams[i:i+n_wr] for i in range(0, len(wrparams), n_wr)]): subQ = slice(i*nobschar,(i+1)*nobschar) wrVals = [x for s in [[0]+wr[k:k+nobschar] for k in range(0, len(wr)+1, nobschar)] for x in s] np.copyto(Q[subQ,subQ], [wrVals[x:x+nobschar] for x in range(0, len(wrVals), nobschar)]) # between regime combs = list(itertools.combinations(list(range(nregime)),2)) revcombs = [tuple(reversed(i)) for i in combs] submatrix_indices = [x for s in [[combs[i]] + [revcombs[i]] for i in range(len(combs))] for x in s] for i,submatrix_index in enumerate(submatrix_indices): my_slice0 = slice(submatrix_index[0]*nobschar, (submatrix_index[0]+1)*nobschar) my_slice1 = slice(submatrix_index[1]*nobschar, (submatrix_index[1]+1)*nobschar) nregimeswitch = (nregime**2 - nregime)*nobschar np.fill_diagonal(Q[my_slice0,my_slice1],[p for p in brparams[i*nobschar:i*nobschar+nobschar]]) elif Qtype == "Simple": #Within-regime for i,wr_p in enumerate(wrparams): wr = [wr_p] * n_wr subQ = slice(i*nobschar,(i+1)*nobschar) wrVals = [x for s in [[0]+wr[k:k+nobschar] for k in range(0, len(wr)+1, nobschar)] for x in s] np.copyto(Q[subQ,subQ], [wrVals[x:x+nobschar] for x in range(0, len(wrVals), nobschar)]) # between-regime combs = list(itertools.combinations(list(range(nregime)),2)) revcombs = [tuple(reversed(i)) for i in combs] submatrix_indices = [x for s in [[combs[i]] + [revcombs[i]] for i in range(len(combs))] for x in s] for i,submatrix_index in enumerate(submatrix_indices): my_slice0 = slice(submatrix_index[0]*nobschar, (submatrix_index[0]+1)*nobschar) my_slice1 = slice(submatrix_index[1]*nobschar, (submatrix_index[1]+1)*nobschar) nregimeswitch = (nregime**2 - nregime)*nobschar np.fill_diagonal(Q[my_slice0,my_slice1],brparams) Q[np.diag_indices(nobschar*nregime)] = (np.sum(Q, axis=1) * -1) if out is None: return Q
[docs]def create_hrm_ar(tree, chars, nregime, findmin=True): """ Create arrays to be used in hrm likelihood function """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] for n in tree: n.cladesize = len(n) nchar = len(set(chars)) * nregime nt = len(tree.descendants()) charlist = list(range(nchar)) nobschar = len(set(chars)) t = np.array([node.length for node in tree.postiter() if not node.isroot], dtype=np.double) nchar = len(set(chars)) * nregime nobschar = len(set(chars)) preleaves = [ n for n in tree.preiter() if n.isleaf ] postleaves = [n for n in tree.postiter() if n.isleaf ] postnodes = list(tree.postiter()) postChars = [ chars[i] for i in [ preleaves.index(n) for n in postleaves ] ] nnode = len(t)+1 nodelist = np.zeros((nnode, nchar+1)) nodelist.fill(-np.inf) # Fill initial likelihoods with log(0) (-inf) leafind = [ n.isleaf for n in tree.postiter()] tmp_ar = np.zeros([nchar]) # For storing calculations in cython code for k,ch in enumerate(postChars): hiddenChs = [y + ch for y in [x * nobschar for x in range(nregime) ]] [ n for i,n in enumerate(nodelist) if leafind[i] ][k][hiddenChs] = np.log(1.0) for i,n in enumerate(nodelist[:-1]): n[nchar] = postnodes.index(postnodes[i].parent) # Setting initial node likelihoods to log one for calculations nodelist[[ i for i,b in enumerate(leafind) if not b],:-1] = np.log(1.0) # Empty Q matrix Q = np.zeros([nchar,nchar], dtype=np.double) # Empty p matrix p = np.empty([nt, nchar, nchar], dtype = np.double, order="C") nodelistOrig = nodelist.copy() # Second copy to refer back to # Empty root prior array rootpriors = np.empty([nchar], dtype=np.double) if findmin: nullval = np.inf else: nullval = -np.inf # Upper bounds treelen = sum([ n.length for n in tree.leaves()[0].rootpath() if n.length]+[ tree.leaves()[0].length]) upperbound = len(tree.leaves())/treelen max_children = max(len(n.children) for n in tree) child_ar = np.empty([tree.cladesize,max_children], dtype=np.int64) child_ar.fill(-1) intnode_list = np.array(sorted(set(nodelist[:-1,nchar])),dtype=int) for intnode in intnode_list: children = np.where(nodelist[:,nchar]==intnode)[0] child_ar[int(intnode)][:len(children)] = children # Giving internal function access to these arrays. # Warning: can be tricky # Need to make sure old values # Aren't accidentally re-used var = {"Q": Q, "p": p, "t":t, "nodelist":nodelist, "charlist":charlist, "nodelistOrig":nodelistOrig, "upperbound":upperbound, "root_priors":rootpriors, "nullval":nullval, "tmp_ar":tmp_ar, "intnode_list":intnode_list,"child_ar":child_ar} return var
[docs]def create_likelihood_function_hrm_mk_MLE(tree, chars, nregime, Qtype, pi="Equal", findmin = True, constraints = "Rate", orderedRegimes = True): """ Create a function that takes values for Q and returns likelihood. Specify the Q to be ER, Sym, or ARD Returned function to be passed into scipy.optimize Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence Qtype (str): ARD only pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. min (bool): Whether the function is to be minimized (False means it will be maximized) orderedRegimes (bool): Whether or not regimes are ordered such that a transition from 1 -> 3 is not possible (a branch in state 1 must pass through 2 to get to 3) Notes: Constraints: The maximum rate within each rate category must be ordered ( fastest rate in slower regime must be slower than fastest rate in fastest regime) Returns: function: Function accepting a list of parameters and returning log-likelihood. To be optmimized with scipy.optimize.minimize """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nchar = len(set(chars)) * nregime nt = len(tree.descendants()) charlist = list(range(nchar)) nobschar = len(set(chars)) var = create_hrm_ar(tree, chars, nregime,findmin) def likelihood_function(Qparams, grad=None): """ NLOPT inputs the parameter array as well as a gradient object. The gradient object is ignored for this optimizer (LN_SBPLX) """ if any(np.isnan(Qparams)): return var["nullval"] # Enforcing upper bound on parameters if (sum(Qparams) > (var["upperbound"]*2)) or any(Qparams <= 0): return var["nullval"] # Filling Q matrices if Qtype == "ARD": var["Q"].fill(0.0) # Re-filling with zeroes wr = ((nobschar**2-nobschar)*nregime) fill_Q_matrix(nobschar, nregime, Qparams[:wr], Qparams[wr:],Qtype="ARD", out=var["Q"], orderedRegimes=orderedRegimes) if constraints == "Rate": qmax = [max(Qparams[i:i+nobschar]) for i in range(0, int(len(Qparams)/2), nobschar)] if sorted(qmax) != qmax: return var["nullval"] elif constraints == "Symmetry": regimes = extract_regimes(var["Q"], nobschar, nregime) for i in range(nregime): regimes[i] = regimes[i].argsort(axis=None).argsort().reshape(nobschar, nobschar) # Check if any regimes have identical ordering if any_equal(regimes): return var["nullval"] elif constraints == "corHMM": assert len(Qparams) == 8 if not Qparams[1] < Qparams[3]: return var["nullval"] elif Qtype == "Simple": var["Q"].fill(0.0) fill_Q_matrix(nobschar, nregime, Qparams[:nregime], Qparams[nregime:], Qtype="Simple", out = var["Q"], orderedRegimes=orderedRegimes) if any(sorted(Qparams[:-1]) != Qparams[:-1]): return var["nullval"] if Qparams[-2] < Qparams[-1]: return var["nullval"] # Filling Q matrices: else: raise ValueError("Qtype must be ARD or Simple") # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: logli = hrm_mk(tree, chars, var["Q"],nregime, pi = pi, ar=var) if not np.isnan(logli): return x * logli# Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def any_equal(ar): """ Given a 3-D matrix, test if any of the sub-matrices are equal. For use in evaluating symmetric constraint in hrm likelihood function """ combs = list(itertools.combinations(list(range(len(ar))),2)) for c in combs: if (ar[c[0]]==ar[c[1]]).all(): return True return False
[docs]def extract_regimes(Q, nobschar, nregime): """ Given a HRM Q matrix,extract the within-regime transitions """ out = np.zeros([nobschar,nobschar,nregime]) for i in range(nregime): subQ = slice(i*nobschar,(i+1)*nobschar) mask = np.ones([nobschar,nobschar], dtype=bool) mask[np.diag_indices(nobschar)]=False np.copyto(out[i],Q[subQ, subQ]) return out
[docs]def fit_hrm_qidx(tree, chars, nregime, qidx, pi="Equal", orderedRegimes=True, startingvals=None): """ Fit a hidden-rates mk model to a given tree and list of characters, and number of regumes. Return fitted ARD Q matrix and calculated likelihood. Args: tree (Node): Root node of a tree. All branch lengths must be chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of hidden rates per character pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. Defaults to "Equal" Method "Fitzjohn" is not thouroughly tested, use with caution orderedRegimes (bool): Whether or not to constrain regime transitions to only occur between adjacent regimes Returns: dict: Dict of fitted Q matrix (a np array) and log-likelihood value """ if type(chars) == dict: data = chars chars = [chars[l] for l in [n.label for n in tree.leaves()]] else: data = dict(zip([n.label for n in tree.leaves()],chars)) nchar = len(set(chars))*nregime nobschar = len(set(chars)) mk_func = cyexpokit.make_hrmlnl_func(tree, data,k=nchar,nq=nregime, qidx=qidx,findmin=True) nparam = len(set([n[-1] for n in qidx])) if startingvals is None: x0 = [0.1]*nparam else: x0 = startingvals opt = nlopt.opt(nlopt.LN_SBPLX, nparam) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) optim = opt.optimize(x0) wr = (nobschar**2-nobschar)*nregime logli = mk_func(optim[:], None) q = np.asarray(mk_func.q)[0] return {"Log-likelihood":-1*float(logli), "Q":q}
[docs]def fit_hrm_randstartingpoints(tree,chars,nregime,nstart=10,pi="Equal",constraints="Rate", Qtype="ARD",orderedRegimes=True): """ Fit a hidden-rates mk model to a given tree and list of characters, and number of regumes. Return fitted ARD Q matrix and calculated likelihood. Run nstart times, each with different randomized starting values, and return the highest likelihood Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of hidden rates per character nstart (int): Number of times to run likelihood fit pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. Defaults to "Equal" Method "Fitzjohn" is not thouroughly tested, use with caution orderedRegimes (bool): Whether or not to constrain regime transitions to only occur between adjacent regimes Returns: tuple: Tuple of fitted Q matrix (a np array) and log-likelihood value """ bestli = -np.inf if type(chars) == dict: nobschar = len(set(chars.values())) else: nobschar = len(set(chars)) if Qtype=="ARD": if orderedRegimes: nparam = ((nobschar**2-nobschar)*nregime) + (nregime-1)*2*nobschar else: nparam = ((nobschar**2-nobschar)*nregime + (nregime**2-nregime)*nobschar) else: nparam = nregime+1 for _ in range(nstart): while 1: startingvals = np.random.uniform(0,1,nparam) out = fit_hrm(tree,chars,nregime,pi,constraints,Qtype,orderedRegimes,startingvals=startingvals) if out["Log-likelihood"] != -np.inf: break if out["Log-likelihood"] > bestli: bestli = out["Log-likelihood"] best = out return best
[docs]def fit_hrm(tree, chars, nregime, pi="Equal", constraints="Rate", Qtype="ARD", orderedRegimes=True, startingvals=None): """ Fit a hidden-rates mk model to a given tree and list of characters, and number of regumes. Return fitted ARD Q matrix and calculated likelihood. Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of hidden rates per character pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. Defaults to "Equal" Method "Fitzjohn" is not thouroughly tested, use with caution orderedRegimes (bool): Whether or not to constrain regime transitions to only occur between adjacent regimes startingvals (list): Values to initialize optimization at. Optional Returns: tuple: Tuple of fitted Q matrix (a np array) and log-likelihood value """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] if Qtype == "Simple": Q, logli, rootLiks, f, par = fit_hrm_mkSimple(tree, chars, nregime, pi, orderedRegimes,startingvals) elif Qtype == "ARD": Q, logli, rootLiks, f, par = fit_hrm_mkARD(tree, chars, nregime, pi, constraints, orderedRegimes,startingvals) else: raise TypeError("Qtype must be Simple or ARD") return {"Q":Q, "Log-likelihood":logli,"rootLiks":rootLiks}
[docs]def fit_hrm_mkARD(tree, chars, nregime, pi="Equal", constraints="Rate", orderedRegimes=True, startingvals=None): """ Fit a hidden-rates mk model to a given tree and list of characters, and number of regumes. Return fitted ARD Q matrix and calculated likelihood. Args: tree (Node): Root node of a tree. All branch lengths must be chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of hidden rates per character pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. Defaults to "Equal" Method "Fitzjohn" is not thouroughly tested, use with caution orderedRegimes (bool): Whether or not to constrain regime transitions to only occur between adjacent regimes Returns: tuple: Tuple of fitted Q matrix (a np array) and log-likelihood value """ if type(chars) == dict: data = chars chars = [chars[l] for l in [n.label for n in tree.leaves()]] else: data = dict(zip([n.label for n in tree.leaves()],chars)) nchar = len(set(chars))*nregime nobschar = len(set(chars)) mk_func = create_likelihood_function_hrm_mk_MLE(tree, chars, nregime=nregime, Qtype="ARD", pi=pi, constraints=constraints, orderedRegimes=orderedRegimes) if not orderedRegimes: ncell = ((nobschar**2-nobschar)*nregime + (nregime**2-nregime)*nobschar) else: ncell = ((nobschar**2-nobschar)*nregime) + (nregime-1)*2*nobschar if startingvals is None: x0 = [0.1]*ncell else: x0 = startingvals opt = nlopt.opt(nlopt.LN_SBPLX, ncell) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) optim = opt.optimize(x0) wr = (nobschar**2-nobschar)*nregime q = fill_Q_matrix(nobschar, nregime, optim[:wr], optim[wr:],"ARD", orderedRegimes=orderedRegimes) piRates = hrm_mk(tree, chars, q, nregime, pi=pi, returnPi=True)[1] return (q, -1*float(mk_func(optim, None)), piRates, mk_func, optim)
[docs]def fit_hrm_mkSimple(tree, chars, nregime, pi="Equal", orderedRegimes=True, startingvals=None): """ Fit a hidden-rates mk model to a given tree and list of characters, and number of regumes. Return fitted ARD Q matrix and calculated likelihood. Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of hidden rates per character pi (str): Either "Equal", "Equilibrium", or "Fitzjohn". How to weight values at root node. Defaults to "Equal" Method "Fitzjohn" is not thouroughly tested, use with caution Returns: tuple: Tuple of fitted Q matrix (a np array) and log-likelihood value """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nchar = len(set(chars))*nregime nobschar = len(set(chars)) mk_func = create_likelihood_function_hrm_mk_MLE(tree, chars, nregime=nregime, Qtype="Simple", pi=pi) if startingvals is None: x0 = [0.1]*(nregime+1) else: x0 = startingvals opt = nlopt.opt(nlopt.LN_SBPLX, nregime+1) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) optim = opt.optimize(x0) q = fill_Q_matrix(nobschar, nregime, optim[:-1], [optim[-1]],"Simple") piRates = hrm_mk(tree, chars, q, nregime, pi=pi, returnPi=True)[1] return (q, -1*float(mk_func(optim, None)), piRates, mk_func, optim)
[docs]def make_regime_type_combos(nregime, nparams): """ Create regime combinations for a binary character """ paramRanks = list(range(nparams+1)) paramPairs = list(itertools.permutations(paramRanks,2)) for p in paramRanks: paramPairs.append((p,p)) regimePairs = list(itertools.combinations(paramPairs,nregime)) return regimePairs
[docs]def remove_redundant_regimes(regimePairs): """ Remove redundant regime pairs (eg (1,1),(2,2) and (2,2),(3,3) are redundant and only one should be kept) """ regime_identities = [make_identity_regime(r) for r in regimePairs] return [regimePairs[regimePairs.index(i)] for i in set(regime_identities)]
[docs]def make_identity_regime(regimePair): """ Given one regime pair, reduce it to its identity (eg (2,2),(3,3) becomes (1,1),(2,2) """ params_present = list(set([i for s in regimePair for i in s])) if 0 in params_present: identity_params = list(range(len(params_present))) else: identity_params = [i+1 for i in range(len(params_present))] identity_regime = tuple([tuple([identity_params[params_present.index(i)] for i in r]) for r in regimePair]) return identity_regime
[docs]def optimize_regime(comb, tree=None, chars=None, nregime=None, nparams=None, pi=None, br_variable=None, out_file=None, ar=None): """ Top-level function for optimizing hrm distinct-regime model in parallel. """ mk_func = hrm_disctinct_regimes_likelihoodfunc(tree, chars, comb, pi=pi, findmin = True, br_variable=br_variable, ar=ar) if not br_variable: nfreeparams = nparams+1 else: nfreeparams = nparams + (nregime**2 - nregime)*2 x0 = [0.1] * nfreeparams opt = nlopt.opt(nlopt.LN_SBPLX, nfreeparams) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) pars = opt.optimize(x0) lik = mk_func(pars) Q = fill_distinct_regime_Q(comb, np.insert(pars, 0, 1e-15),nregime,2,br_variable=br_variable) if out_file is not None: with open(out_file+str(comb)+".p", "wb") as f: pickle.dump((comb, lik, Q), f) return comb,lik,Q
[docs]def fit_hrm_distinct_regimes(tree, chars, nregime, nparams, pi="Equal", br_variable=False, parallel=False, ncores=2, out_file=None): """ Fit hrm with distinct regime types given number of regimes and number of parameters BINARY CHARACTERS ONLY Args: tree (Node): Root node of tree chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence nregime (int): Number of regimes to test in model nparams (int): Number of unique parameters available for regimes pi (str): Root prior br_variable (bool): Whether or not between-regime rates are allowed to vary parallel (bool): Whether to run in parallel ncores (int): If parallel is True, number of cores to run on out_file (str): Optional output file name. If given, results will written into the current directory with this string as the first part of the filename """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nchar = len(set(chars)) if nchar != 2: raise ValueError("Binary characters only. Number of states given:{}".format(nchar)) regime_combinations = make_regime_type_combos(nregime, nparams) regime_combinations = remove_redundant_regimes(regime_combinations) pars = [None] * len(regime_combinations) Qs = [np.zeros([4,4])] * len(regime_combinations) liks = [None] * len(regime_combinations) ncomb = len(regime_combinations) ar = create_hrm_ar(tree, chars, nregime) print("Testing {} regimes".format(ncomb)) if parallel: pool = mp.Pool(processes = ncores) func = partial(optimize_regime, tree=tree, chars=chars, nregime=nregime,nparams=nparams,pi=pi, br_variable=br_variable, ar=ar, out_file=out_file) results = pool.map(func, regime_combinations) out = results return {r[0]:(r[1],r[2]) for r in out} else: for i, comb in enumerate(regime_combinations): mk_func = hrm_disctinct_regimes_likelihoodfunc(tree, chars, comb, pi=pi, findmin = True, br_variable=br_variable, ar=ar) if not br_variable: nfreeparams = nparams+1 else: nfreeparams = nparams + (nregime**2 - nregime)*2 x0 = [0.1] * nfreeparams opt = nlopt.opt(nlopt.LN_SBPLX, nfreeparams) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) pars[i] = opt.optimize(x0) liks[i] = -mk_func(pars[i]) Qs[i] = fill_distinct_regime_Q(comb, np.insert(pars[i],0,1e-15), nregime, nchar, br_variable=br_variable) if out_file is not None: with open(out_file+str(comb)+".p", "wb") as f: pickle.dump((comb, liks[i], Qs[i]), f) print("{}/{} regimes tested".format(i+1,ncomb)) return {r:(liks[i], Qs[i]) for i,r in enumerate(regime_combinations)}
[docs]def fill_Q_layout(regimetype, Qparams): """ Fill a single regime matrix based on the regime type and the given parameters. """ Q = np.array([[0,Qparams[regimetype[0]]], [Qparams[regimetype[1]],0]]) return Q
[docs]def hrm_disctinct_regimes_likelihoodfunc(tree, chars, regimetypes, pi="Equal", findmin=True, br_variable=False, ar=None): if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nregime = len(regimetypes) nchar = len(set(chars)) * nregime nt = len(tree.descendants()) charlist = list(range(nchar)) nobschar = len(set(chars)) nregimeshift = (nregime**2 - nregime)*2 if not br_variable: nregimeshift = 1 if ar is None: var = create_hrm_ar(tree, chars, nregime, findmin) else: var = ar.copy() var["Q_layout"] = np.zeros([nregime,nobschar,nobschar]) def likelihood_function(Qparams, grad=None): """ NLOPT inputs the parameter array as well as a gradient object. The gradient object is ignored for this optimizer (LN_SBPLX) """ if any(np.isnan(Qparams)): return var["nullval"] # Enforcing upper bound on parameters if (sum(Qparams) > (var["upperbound"]*2)) or any(Qparams <= 0): return var["nullval"] if any(sorted(Qparams[:-nregimeshift])!=Qparams[:-nregimeshift]): return var["nullval"] if any(Qparams[-nregimeshift:]>Qparams[-(nregimeshift+1)]): return var["nullval"] Qparams = np.insert(Qparams, 0, 1e-15) fill_distinct_regime_Q(regimetypes,Qparams, nregime,nobschar,var["Q"], var["Q_layout"], br_variable) # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: logli = hrm_mk(tree, chars, var["Q"],nregime, pi = pi, ar=var) if not np.isnan(logli): return x * logli# Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def fill_distinct_regime_Q(regimetypes, Qparams,nregime, nobschar, Q = None, Q_layout = None, br_variable=False): returnQ = False if Q is None: returnQ = True Q = np.zeros([nobschar*nregime,nobschar*nregime]) Q_layout = np.zeros([nregime,nobschar,nobschar]) for i,rtype in enumerate(regimetypes): Q_layout[i] = fill_Q_layout(rtype, Qparams) # Filling regime sub-Qs within Q matrix: for i,regime in enumerate(Q_layout): subQ = slice(i*nobschar,(i+1)*nobschar) Q[subQ, subQ] = regime # Filling in between-regime values for i,submatrix_index in enumerate(itertools.permutations(list(range(nregime)),2)): my_slice0 = slice(submatrix_index[0]*nobschar, (submatrix_index[0]+1)*nobschar) my_slice1 = slice(submatrix_index[1]*nobschar, (submatrix_index[1]+1)*nobschar) if not br_variable: np.fill_diagonal(Q[my_slice0,my_slice1],Qparams[-1]) else: nregimeswitch =(nregime**2 - nregime)*2 np.fill_diagonal(Q[my_slice0,my_slice1],Qparams[(-nregimeswitch+i*2):]) np.fill_diagonal(Q, -np.sum(Q,1)) if returnQ: return Q
[docs]def fit_hrm_model(tree, chars, nregime, mod, pi="Equal", findmin=True, initialvals = None): """ Fit parameters for a single model specified by the user """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nobschar = len(set(chars)) if nobschar != 2: raise ValueError("Binary characters only. Number of states given:{}".format(nchar)) nchar = nobschar * nregime ar = create_hrm_ar(tree, chars, nregime) nfreeparams = len(set([i for i in mod if i != 0])) n_wr = nobschar**2 - nobschar mod_format = format_mod(mod, nregime, nobschar) mk_func = fit_hrm_model_likelihood(tree, chars, nregime, mod_format, pi, findmin) if initialvals is None: x0 = [0.1]*nfreeparams else: x0 = initialvals opt = nlopt.opt(nlopt.LN_SBPLX, nfreeparams) opt.set_min_objective(mk_func) opt.set_lower_bounds(0) par = opt.optimize(x0) lik = -mk_func(par) Q = np.zeros([nchar, nchar]) fill_model_Q(mod_format, np.insert(par, 0, 1e-15), Q) return Q, lik
[docs]def fill_model_Q(mod, Qparams, Q): """ Fill Q matrix given model and parameters. Args: mod (list): List of tuples. Code for the mode used. The first nregime tuples correspond to the within-regime transition rates for each regime. The last tuple corresponds to the between-regime transition rates. Ex. [(1, 2), (3, 4), (5, 6, 7, 8)] corresponds to a matrix of: [-,1,5,-] [2,-,-,6] [7,-,-,3] [-,8,4,-] Qparams (list): List of floats corresponding to the values for slow, medium, fast, etc. rates. The first is always 0 Q (np.array): Pre-allocated Q matrix """ nregime = len(mod)-1 nobschar = Q.shape[0]/nregime nchar = Q.shape[0] Q.fill(0.0) for i in range(nregime): subQ = slice(i*nobschar,(i+1)*nobschar) subQvals = [Qparams[x] for s in [(0,)+mod[i][k:k+nobschar] for k in range(0,len(mod[i])+1,nobschar)] for x in s] np.copyto(Q[subQ, subQ], [subQvals[x:x+nobschar] for x in range(0, len(subQvals), nobschar)]) combs = list(itertools.combinations(list(range(nregime)),2)) revcombs = [tuple(reversed(i)) for i in combs] submatrix_indices = [x for s in [[combs[i]] + [revcombs[i]] for i in range(len(combs))] for x in s] for i,submatrix_index in enumerate(submatrix_indices): my_slice0 = slice(submatrix_index[0]*nobschar, (submatrix_index[0]+1)*nobschar) my_slice1 = slice(submatrix_index[1]*nobschar, (submatrix_index[1]+1)*nobschar) nregimeswitch =(nregime**2 - nregime)*2 np.fill_diagonal(Q[my_slice0,my_slice1],[Qparams[p] for p in mod[nregime][i*nobschar:i*nobschar+nobschar]]) np.fill_diagonal(Q, -np.sum(Q,1))
[docs]def fit_hrm_model_likelihood(tree, chars, nregime, mod, pi="Equal", findmin=True): """ Likelihood function for fitting user-specified model """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] nobschar = len(set(chars)) nchar = nobschar*nregime nt = len(tree.descendants()) charlist = list(range(nchar)) var = create_hrm_ar(tree, chars, nregime, findmin) var["Q_layout"] = np.zeros([nregime,nobschar,nobschar]) def likelihood_function(Qparams, grad=None): """ NLOPT inputs the parameter array as well as a gradient object. The gradient object is ignored for this optimizer (LN_SBPLX) """ if any(np.isnan(Qparams)): return var["nullval"] if not all(sorted(Qparams) == Qparams): return var["nullval"] if (sum(Qparams) > (var["upperbound"]*2)) or any(Qparams <= 0): return var["nullval"] Qparams = np.insert(Qparams, 0, 1e-15) fill_model_Q(mod, Qparams, var["Q"]) # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: logli = hrm_mk(tree, chars, var["Q"],nregime, pi = pi, ar=var) if not np.isnan(logli): return x * logli# Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def cluster_models(tree, chars, Q, nregime, pi="Equal", findmin=True): """ Given an MLE Q, return candidate models with more parsimonious parameter set by merging similar parameters """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] Qparams = extract_Qparams(Q, nregime) ts = np.linspace(-10, 0, 11) Q_dist = np.array(list(zip(list(Qparams), [0]*len(Qparams)))) candidate_models = list(set([tuple(scipy.cluster.hierarchy.fclusterdata(Q_dist, i)) for i in ts])) if any(np.isclose(Qparams, 0.0)): for i,c in enumerate(candidate_models): candidate_models[i] = tuple([x-1 for x in c]) nmod = len(candidate_models) print(("Testing {} models".format(nmod))) alt_mods = {c:fit_hrm_model(tree,chars,nregime,c,pi=pi,findmin=findmin) for c in candidate_models} return alt_mods
[docs]def AIC(l, k): """ Akaike information criterion """ return 2*k - 2*l
[docs]def pairwise_merge(tree, chars, Q, nregime, pi="Equal"): """ Given an MLE Q, merge similar parameters pairwise to find more parsimonious models Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence Q (np.array): Instantaneous rate matrix nregime (int): Number of regimes in the model pi (str or np.array): Option to weight the root node by given values. Either a string containing the method or an array of weights. Weights should be given in order. Accepted methods of weighting root: Equal: flat prior Equilibrium: Prior equal to stationary distribution of Q matrix Fitzjohn: Root states weighted by how well they explain the data at the tips. """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] Qparams = extract_Qparams(Q, nregime) prev_mod = Qparams.argsort().argsort() + 1 nobschar = len(set(chars)) prev_l = hrm_mk(tree, chars, Q, nregime, pi=pi) prev_AIC = AIC(prev_l, len(Qparams)) prev_Q = Q.copy() new_Q = Q.copy() new_Qparams = Qparams.copy() prev_Qparams = Qparams.copy() # Matrix of distance between pairs dist_mat = abs(Qparams[..., np.newaxis] - Qparams[np.newaxis, ...]) dist_mat[np.tril_indices(len(Qparams))] = np.inf # Pre-allocated arrays to speed up likelihood calculations ar = create_hrm_ar(tree, chars, nregime) while 1: # Cleaning pre-allocated arrays np.copyto(ar["nodelist"], ar["nodelistOrig"]) ar["root_priors"].fill(1.0) # Merging parameters closest_pair = divmod(np.argmin(dist_mat), len(Qparams)) greater = max(prev_mod[list(closest_pair)]) new_mod = np.array([i if i<greater else i-1 for i in prev_mod]) new_param = np.mean(Qparams[list(closest_pair)]) inds = np.logical_or(Qparams==Qparams[closest_pair[0]], Qparams==Qparams[closest_pair[1]]) new_Qparams = Qparams.copy() new_Qparams[inds] = new_param # Calculating likelihood fill_model_Q(format_mod(new_mod, nregime, nobschar), np.insert(sorted(set(new_Qparams)), 0, 1e-15), new_Q) new_l = hrm_mk(tree,chars,new_Q,nregime,pi=pi,ar=ar) new_AIC = AIC(new_l, len(set(new_mod))) # Recording that we have checked this pair dist_mat[closest_pair[0], closest_pair[1]] = np.inf if new_AIC < prev_AIC: # Merging the two pairs dist_mat[closest_pair[1],] = np.inf dist_mat[:,closest_pair[1]] = np.inf # Recording new best model so far prev_mod = new_mod.copy() prev_AIC = new_AIC prev_l = new_l prev_Q = new_Q.copy() Qparams = new_Qparams.copy() if (dist_mat==np.inf).all(): break re_optim = fit_hrm_model(tree, chars, nregime, prev_mod, pi=pi, findmin=True, initialvals=sorted(set(Qparams))) return [prev_mod, re_optim[0], re_optim[1]]
[docs]def extract_Qparams(Q, nregime): nchar = Q.shape[0] nobschar = nchar/nregime n_wr = (nobschar**2-nobschar) n_br = (nregime**2-nregime)*nobschar Qparams = np.zeros([n_wr*nregime + n_br]) for i in range(nregime): subQ = slice(i*nobschar,(i+1)*nobschar) mask = np.ones([nobschar,nobschar], dtype=bool) mask[np.diag_indices(nobschar)]=False np.copyto(Qparams[i*n_wr:(i+1)*n_wr], Q[subQ,subQ][mask]) combs = list(itertools.combinations(list(range(nregime)),2)) revcombs = [tuple(reversed(i)) for i in combs] submatrix_indices = [x for s in [[combs[i]] + [revcombs[i]] for i in range(len(combs))] for x in s] for i,submatrix_index in enumerate(submatrix_indices): my_slice0 = slice(submatrix_index[0]*nobschar, (submatrix_index[0]+1)*nobschar) my_slice1 = slice(submatrix_index[1]*nobschar, (submatrix_index[1]+1)*nobschar) nregimeswitch =(nregime**2 - nregime)*2 Qparams[n_wr*nregime + i*nobschar:n_wr*nregime+(i+1)*nobschar] = Q[my_slice0,my_slice1][np.diag_indices(nobschar)] return Qparams
[docs]def format_mod(mod, nregime, nobschar): """ Convert flattened model to formatted model for use in fill_model_Q """ n_wr = nobschar**2 - nobschar mod_format = [tuple(mod[i:i+n_wr]) for i in range(0, n_wr*nregime, n_wr)] mod_format.append(tuple(mod[n_wr*nregime:])) return mod_format