Source code for ivy.chars.mk_mr

# Mk multi regime models
from __future__ import absolute_import, division, print_function, unicode_literals

import ivy
import math
import random
import itertools
import types
from math import ceil

import line_profiler
import numpy as np
import scipy
import pymc
from scipy import special
from scipy.optimize import minimize
from scipy.special import binom

from ivy.chars.expokit import cyexpokit

try:
    StringTypes = types.StringTypes # Python 2
except AttributeError: # Python 3
    StringTypes = [str]
IDEG = 6 # Constant used in Cython/Fortran code.



[docs]def mk_mr_midbranch(tree, chars, Qs, switchpoint, pi="Equal", returnPi=False, ar = None, debug=False): """ Calculate likelhiood of mk model with BAMM-like multiple regimes Args: tree (Node): Root node of a tree. All branch lengths must be greater than 0 (except root) chars (dict): Dict mapping character states to tip labels. Character states should be coded 0,1,2... Can also be a list with tip states in preorder sequence Qs (np.array): Array of instantaneous rate matrices locs (np.array): Array of the same length as Qs containing the node indices that correspond to each Q matrix p (np.array): Optional pre-allocated p matrix pi (str or np.array): Option to weight the root node by given values. Either a string containing the method or an array of weights. Weights should be given in order. Accepted methods of weighting root: Equal: flat prior Equilibrium: Prior equal to stationary distribution of Q matrix Fitzjohn: Root states weighted by how well they explain the data at the tips. Returns: (float): Log-likelihood of tree/characters given the Q matrices. """ #################### # Step 1: setting up #################### # These lines of code ensure that the object switchpoint is of the correct # type even when passed in from PYMC if len(switchpoint)>0: if type(switchpoint[0]) == pymc.PyMCObjects.Stochastic: switchpoint = [i.value for i in switchpoint] # chars can be passed in as a dict. If it is, convert it to a list of ints in preorder sequence. if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] # Number of character states nchar = len(set(chars)) # Creation of preallocated arrays. if ar is None: ar = create_mkmr_mb_ar(tree,chars,nregime=Qs.shape[0],findmin=True) # chars is going to need to be reordered. chars = ar["chars"] # Get the actual nodes from the switchpoint tuple switchpoint_nodes = [ar["tree_copy"][switchpoint[i][0].id] for i in range(len(switchpoint))] # Reset t values ar["t"] = ar["blens"][:] + 1e-40 # Branch lengths can't be exactly zero. # Adjust t to represent the length of each switchpoint for s in switchpoint: sw = ar["tree_copy"][s[0].id].pi #Switchpoint sk = ar["tree_copy"][s[0].id].parent.pi #Switchpoint's parent ar["t"][sw] = s[1] ar["t"][sk] = ar["tree_copy"][s[0].id].parent.length - s[1] # inds corresponds to the tree in postorder sequence, with each value # corresponding to the regime that node is in. It is passed to dexpm3 switches_ni = [ar["tree_copy"][s[0].id].ni for s in switchpoint] + [0] cyexpokit.inds_from_switchpoint_p(np.array(switches_ni), ar["cladesize_preorder"], ar["clades_postorder_preorder"], ar["inds"]) (Qs != ar["prev_Q"]).any(axis=(1,2), out=ar["Qdif"]) ##################### # Step 2: create "pmask" ##################### # pmask is a mask array that keeps track of which p-matrices need to be # recalculated and which ones can use the values from the previous call. #Which Qs are different? Different Qs mean we have to recalculate p for that index np.logical_or.reduce((ar["prev_inds"]!=ar["inds"], ar["t"]!=ar["prev_t"],ar["Qdif"][ar["inds"]]),out=ar["pmask"]) # Which p-matrices do we need to recalcualte? ################## # Step 3: calculate p matrices ################## # Creating probability matrices from Q matrices and branch lengths # inds indicates which Q matrix to use for which branch # np.exp(ar["p"], out=ar["p"]) # Exponentiate p matrices Qs += 1e-40 cyexpokit.lndexpm3(Qs,ar["t"],np.array(ar["inds"]),ar["p"],ideg=IDEG,wsp=ar["wsp"],pmask=ar["pmask"].astype(int)) # Calculating the actual p matrix # np.log(ar["p"], out=ar["p"]) # Log p matrices ################# # Step 4: calculate likelihood ################# # Calculating the likelihoods for each node in post-order sequence np.copyto(ar["nodelist"], ar["nodelistOrig"]) # Resetting the starting values of nodelist. cyexpokit.cy_mk_log(ar["nodelist"], ar["p"], nchar, ar["tmp_ar"],ar["intnode_list"],ar["child_ar"]) # Performing the likelihood calculation # cyexpokit.cy_mk_log(ar["nodelist"], ar["p"], nchar, ar["tmp_ar"],ar["intnode_list"],ar["child_ar"]) ################ # Step 5: apply root prior ################ # The last row of nodelist contains the likelihood values at the root # Applying the correct root prior if not type(pi) in StringTypes: assert len(pi) == nchar, "length of given pi does not match Q dimensions" assert str(type(pi)) == "<type 'numpy.ndarray'>", "pi must be str or numpy array" assert np.isclose(sum(pi), 1), "values of given pi must sum to 1" np.copyto(ar["root_priors"], pi) rootliks = ([ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ]) elif pi == "Equal": ar["root_priors"].fill(1.0/nchar) rootliks = [ i+np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1])] elif pi == "Fitzjohn": np.copyto(ar["root_priors"], [ar["nodelist"][-1,:-1][charstate]- scipy.misc.logsumexp(ar["nodelist"][-1,:-1]) for charstate in set(chars) ]) rootliks = [ ar["nodelist"][-1,:-1][charstate] + ar["root_priors"][charstate] for charstate in set(chars) ] elif pi == "Equilibrium": # Equilibrium pi from the stationary distribution of Q np.copyto(ar["root_priors"],qsd(Q)) rootliks = [ i + np.log(ar["root_priors"][n]) for n,i in enumerate(ar["nodelist"][-1,:-1]) ] logli = scipy.misc.logsumexp(rootliks) ############### # Cleanup and storing values for next call to this function ############## ar["prev_t"][:] = ar["t"][:] ar["prev_inds"][:] = ar["inds"][:] ar["prev_Q"][:] = Qs[:] if returnPi: return (logli, {k:v for k,v in enumerate(ar["root_priors"])}) else: return logli
[docs]def create_mkmr_ar(tree, chars,nregime,findmin = True): """ Create preallocated arrays. For use in mk function Nodelist = edgelist of nodes in postorder sequence """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] t = np.array([node.length for node in tree.postiter() if not node.isroot], dtype=np.double) # t is in POSTORDER SEQUENCE nt = len(tree.descendants()) nchar = len(set(chars)) preleaves = [ n for n in tree.preiter() if n.isleaf ] postleaves = [n for n in tree.postiter() if n.isleaf ] postnodes = list(tree.postiter()) postChars = [ chars[i] for i in [ preleaves.index(n) for n in postleaves ] ] nnode = len(t)+1 nodelist = np.zeros((nnode, nchar+1)) nodelist.fill(-np.inf) # the log of 0 is negative infinity leafind = [ n.isleaf for n in tree.postiter()] for k,ch in enumerate(postChars): [ n for i,n in enumerate(nodelist) if leafind[i] ][k][ch] = np.log(1.0) for i,n in enumerate(nodelist[:-1]): n[nchar] = postnodes.index(postnodes[i].parent) # Setting initial node likelihoods to log one for calculations nodelist[[ i for i,b in enumerate(leafind) if not b],:-1] = np.log(1.0) # Empty Q matrix Q = np.zeros([nregime,nchar, nchar], dtype=np.double) # Empty p matrix p = np.empty([nt, nchar, nchar], dtype = np.double, order="C") nodelistOrig = nodelist.copy() rootpriors = np.empty([nchar], dtype=np.double) if findmin: nullval = np.inf else: nullval = -np.inf treelen = sum([ n.length for n in tree.leaves()[0].rootpath() if n.length]+[ tree.leaves()[0].length]) upperbound = len(tree.leaves())/treelen charlist = list(range(nchar)) tmp_ar = np.zeros(nchar) # Used for storing calculations wsp = np.empty(4*nchar*nchar+IDEG+1) var = {"Q": Q, "p": p, "t":t, "nodelist":nodelist, "charlist":charlist, "nodelistOrig":nodelistOrig, "upperbound":upperbound, "root_priors":rootpriors, "nullval":nullval, "tmp_ar":tmp_ar} return var
[docs]def create_mkmr_mb_ar(tree, chars,nregime,findmin = True): """ Preallocated arrays for midbranch mk_mr Create preallocated arrays. For use in mk_mr midbranch function Nodelist = edgelist of nodes in postorder sequence """ if type(chars) != dict: chars = {tree.leaves()[i].label:v for i,v in enumerate(chars)} tree_copy_ = tree.copy() for n in tree_copy_.descendants(): n.meta["cached"]=False n.bisect_branch(1e-15, reindex=False) # Here we break up the tree so that each node has a "knee" # for a parent. This knee starts with a length of 0, effectively # making it nonexistant, but can have its length changed # to act as a switchpoint. tree_copy_str = tree_copy_.write() tree_copy = ivy.tree.read(tree_copy_str) for n1,n2 in zip(tree_copy_.preiter(), tree_copy.preiter()): n2.id = n1.id for n in tree_copy: n.cladesize = len(n) chars = [chars[l] for l in [n.label for n in tree_copy.leaves()]] blens = np.array([node.length for node in tree_copy.postiter() if not node.isroot]) t = np.array(blens, dtype=np.double) prev_t = t.copy() prev_t.fill(np.inf) nt = len(t) nchar = len(set(chars)) preleaves = [ n for n in tree_copy.preiter() if n.isleaf ] postleaves = [n for n in tree_copy.postiter() if n.isleaf ] postnodes = list(tree_copy.postiter()) edgelist = [-np.inf] * len(tree_copy) for i in range(len(edgelist)-1): edgelist[i] = postnodes[i].parent.pi postChars = [ chars[i] for i in [ preleaves.index(n) for n in postleaves ] ] nnode = len(t)+1 nodelist = np.zeros((nnode, nchar+1)) nodelist.fill(-np.inf) # the log of 0 is negative infinity leafind = [ n.isleaf for n in tree_copy.postiter()] for k,ch in enumerate(postChars): [ n for i,n in enumerate(nodelist) if leafind[i] ][k][ch] = np.log(1.0) nodelist[:,-1] = edgelist # Setting initial node likelihoods to log one for calculations nodelist[[ i for i,b in enumerate(leafind) if not b],:-1] = np.log(1.0) # Empty Q matrix Q = np.zeros([nregime,nchar, nchar], dtype=np.double) prev_Q = Q.copy() prev_Q.fill(np.inf) # Empty p matrix p = np.empty([nt, nchar, nchar], dtype = np.double, order="C") pmask = np.array([True]*len(t)) nodelistOrig = nodelist.copy() rootpriors = np.empty([nchar], dtype=np.double) if findmin: nullval = np.inf else: nullval = -np.inf treelen = sum([ n.length for n in tree_copy.leaves()[0].rootpath() if n.length]+[ tree_copy.leaves()[0].length]) upperbound = len(tree_copy.leaves())/treelen charlist = list(range(nchar)) tmp_ar = np.zeros(nchar) # Used for storing calculations pre_post = np.array([n.pi for n in tree_copy]) # The postorder index that corresponds to each preorder index prev_inds = np.zeros(len(t), dtype=int) #Keeping track of locations from previous call Qdif = np.ones([nregime], dtype=bool) # Array for keeping track of changes to Q locs = np.zeros(nregime, dtype=object) inds = np.zeros(nt, dtype=int) max_children = max(len(n.children) for n in tree_copy) child_ar = np.empty([tree_copy.cladesize,max_children], dtype=np.int64) child_ar.fill(-1) intnode_list = np.array(sorted(set(nodelist[:-1,nchar])), dtype=int) for intnode in intnode_list: children = np.where(nodelist[:,nchar]==intnode)[0] child_ar[int(intnode)][:len(children)] = children wsp = np.empty(4*nchar*nchar+IDEG+1) cladesize_preorder = np.array([len(n) for n in tree_copy]) clades_postorder_preorder = np.zeros([len(tree_copy),len(tree_copy)]) clades_postorder_preorder -= 1 for ni,node in enumerate(tree_copy): clades_postorder_preorder[ni][:cladesize_preorder[ni]] = [x.pi for x in node] var = {"Q": Q, "p": p, "t":t, "nodelist":nodelist, "charlist":charlist, "nodelistOrig":nodelistOrig, "upperbound":upperbound, "root_priors":rootpriors, "nullval":nullval, "tmp_ar":tmp_ar, "tree_copy":tree_copy,"chars":chars,"pre_post":pre_post,"prev_Q":prev_Q, "prev_t":prev_t, "blens":blens,"pmask":pmask, "prev_inds":prev_inds, "locs":locs,"Qdif":Qdif, "intnode_list":intnode_list,"child_ar":child_ar,"wsp":wsp, "inds":inds,"cladesize_preorder":cladesize_preorder, "clades_postorder_preorder":clades_postorder_preorder} return var
[docs]def create_likelihood_function_multimk(tree, chars, Qtype, nregime, pi="Equal", findmin = True): if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] if findmin: nullval = np.inf else: nullval = -np.inf nchar = len(set(chars)) nt = len(tree.descendants()) charlist = list(range(nchar)) # Number of parameters per Q matrix n_qp = nchar**2-nchar # Empty Q matrix Q = np.zeros([nregime,nchar,nchar], dtype=np.double) # Empty p matrix p = np.empty([nt, nchar, nchar], dtype = np.double, order="C") # Empty likelihood array var = create_mkmr_ar(tree, chars,nregime,findmin) def likelihood_function(Qparams, locs): # Enforcing upper bound on parameters Qparams = [float(qp) for qp in Qparams] # # TODO: replace with sum of each Q if (np.sum(Qparams)/len(locs) > var["upperbound"]) or any([x<=0 for x in Qparams]): return var["nullval"] # if not (Qparams == sorted(Qparams)): # return var["nullval"] # Filling Q matrices: Qparams = [Qparams[i:i+n_qp] for i in range(nregime)] if Qtype == "ER": for i,qmat in enumerate(var["Q"]): qmat.fill(float(Qparams[i])) qmat[np.diag_indices(nchar)] = -Qparams[i] * (nchar-1) elif Qtype == "ARD": for i,qmat in enumerate(var["Q"]): qmat.fill(0.0) # Re-filling with zeroes qmat[np.triu_indices(nchar, k=1)] = Qparams[i][:len(Qparams[i])/2] qmat[np.tril_indices(nchar, k=-1)] = Qparams[i][len(Qparams[i])/2:] qmat[np.diag_indices(nchar)] = 0-np.sum(qmat, 1) else: raise ValueError("Qtype must be one of: ER, Sym, ARD") # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: return x * mk_mr(tree, chars, var["Q"], locs, pi = pi, ar=var) # Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def lf_mk_mr_midbranch(tree, chars, Qtype, nregime, pi="Equal", findmin = True): if type(chars) == dict: chardict = chars chars = [chars[l] for l in [n.label for n in tree.leaves()]] else: chardict = {tree.leaves()[i].label:v for i,v in enumerate(chars)} nchar = len(set(chars)) # Empty likelihood array var = create_mkmr_mb_ar(tree, chardict,nregime,findmin) # Number of parameters per Q matrix n_qp = nchar**2-nchar def likelihood_function(Qparams, switchpoint): # Enforcing upper bound on parameters Qparams = [float(qp) for qp in Qparams] # # TODO: replace with sum of each Q if (np.sum(Qparams)/nregime > var["upperbound"]) or any([x<=0 for x in Qparams]): return var["nullval"] # if not (Qparams == sorted(Qparams)): # return var["nullval"] # Filling Q matrices: Qparams = [Qparams[i*n_qp:i*n_qp+n_qp] for i in range(nregime)] if Qtype == "ER": for i,qmat in enumerate(var["Q"]): qmat.fill(float(Qparams[i])) qmat[np.diag_indices(nchar)] = -Qparams[i] * (nchar-1) elif Qtype == "ARD": for i,qmat in enumerate(var["Q"]): qmat.fill(0.0) # Re-filling with zeroes qmat[np.triu_indices(nchar, k=1)] = Qparams[i][:len(Qparams[i])//2] qmat[np.tril_indices(nchar, k=-1)] = Qparams[i][len(Qparams[i])//2:] qmat[np.diag_indices(nchar)] = 0-np.sum(qmat, 1) else: raise ValueError("Qtype must be one of: ER, Sym, ARD") # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: return x * mk_mr_midbranch(tree, chars, var["Q"], switchpoint, pi = pi, ar=var) # Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def create_likelihood_function_multimk_mods(tree, chars, mods, pi="Equal", findmin = True, orderedparams=True): """ Create a likelihood function for testing the parameter values of user- specified models """ if type(chars) == dict: chars = [chars[l] for l in [n.label for n in tree.leaves()]] if findmin: nullval = np.inf else: nullval = -np.inf nregime = len(mods) nchar = len(set(chars)) nt = len(tree.descendants()) charlist = list(range(nchar)) nparam = len(set([i for s in mods for i in s])) # Empty likelihood array var = create_mkmr_ar(tree, chars,nregime,findmin) def likelihood_function(Qparams, locs): # Enforcing upper bound on parameters Qparams = np.insert(Qparams, 0, 1e-15) # # TODO: replace with sum of each Q if (np.sum(Qparams)/len(locs) > var["upperbound"]) or any([x<=0 for x in Qparams]): return var["nullval"] if orderedparams: if any(Qparams != sorted(Qparams)): return var["nullval"] # Clearing Q matrices var["Q"].fill(0.0) # Filling Q matrices: fill_model_mr_Q(Qparams, mods, var["Q"]) # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: return x * mk_mr(tree, chars, var["Q"], locs, pi = pi, ar=var) # Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def lf_mk_mr_midbranch_mods(tree, chars, mods, pi="Equal", findmin = True, orderedparams=True): """ Create a likelihood function for testing the parameter values of user- specified models with switchpoints allowed in the middle of branches """ if type(chars) == dict: chardict = chars chars = [chars[l] for l in [n.label for n in tree.leaves()]] else: chardict = {tree.leaves()[i].label:v for i,v in enumerate(chars)} nregime = len(mods) nparam = len(set([i for s in mods for i in s])) # Empty likelihood array var = create_mkmr_mb_ar(tree, chardict,nregime,findmin) def likelihood_function(Qparams, switchpoint): # Enforcing upper bound on parameters Qparams = np.insert(Qparams, 0, 1e-15) # # TODO: replace with sum of each Q if (np.sum(Qparams)/nregime > var["upperbound"]) or any([x<=0 for x in Qparams]): return var["nullval"] if orderedparams: if any(Qparams != sorted(Qparams)): return var["nullval"] # Clearing Q matrices var["Q"].fill(0.0) # Filling Q matrices: fill_model_mr_Q(Qparams, mods, var["Q"]) # Resetting the values in these arrays np.copyto(var["nodelist"], var["nodelistOrig"]) var["root_priors"].fill(1.0) if findmin: x = -1 else: x = 1 try: return x * mk_mr_midbranch(tree, chars, var["Q"], switchpoint, pi = pi, ar=var) # Minimizing negative log-likelihood except ValueError: # If likelihood returned is 0 return var["nullval"] return likelihood_function
[docs]def fill_model_mr_Q(Qparams, mods, Q): """ Fill a Q-matrix with Qparams corresponding to the model. Individual Q matrices are filled rowwise Function alters Q in place """ # Filling Q matrix row-wise (skipping diagonals) nregime = len(mods) nchar=Q.shape[1] for regime in range(nregime): modcount = 0 # Which mod index are we on for r,c in itertools.product(list(range(nchar)),repeat=2): if r==c: # Skip diagonals pass else: Q[regime,r,c] = Qparams[mods[regime][modcount]] modcount+=1 np.fill_diagonal(Q[regime], -np.sum(Q[regime], axis=1)) # Filling diagonals
[docs]def locs_from_switchpoint(tree, switches, locs=None): """ Given a tree and a list of nodes to be switchpoints, return an array of all node indices in one regime vs the other Args: tree (Node): Root node of tree switches (list): List of internal nodes to act as switchpoints. Switchpoint nodes are part of their own regimes. locs (np.array): Optional pre-allocated array to store output. Returns: np.array: Array of indices of all nodes associated with each switchpoint, plus all nodes "outside" the switches in the last list """ # Include the root switches = switches + [tree] # Sort switches by clade size # We need the "cladesize" attribute to sort the clades. If it does not # exist on the tree, create it if not hasattr(tree, "cladesize"): for n in tree: n.cladesize = len(n) switches_c = switches[:] switches_c.sort(key=lambda x: x.cladesize) if locs is None: out = True locs = np.zeros(len(switches_c), dtype=object) else: out = False locs.fill(0) # Clear locs array for i,s in enumerate(switches_c): t_l = [] # Add descendants of node to location array if they are not # already recorded. This approach works because the clade sizes # were sorted beforehand viewed = set([x for l in locs[:i] for x in l]) for n in tree[s]: if not n.ni in viewed: t_l.append(n.ni) locs[i] = t_l # Remove the root locs[-1] = locs[-1][1:] # Return locs in proper order (locations descended from each switch in order, # with locations descended from the root last) locs[:] =locs[[switches_c.index(i) for i in switches]][:] if out: return locs
[docs]def inds_from_switchpoint_cython(tree, switches_ni, ar,debug=False): nr = len(switches_ni)-1 switches_ni_c = switches_ni[:] switches_ni_c.sort(key=lambda x: ar["cladesize_preorder"][x]) switches_key = [switches_ni_c.index(i) for i in switches_ni] for i,s in enumerate(switches_ni_c[::-1]): for n in ar["clades_postorder_preorder"][s]: if not n==len(ar["cladesize_preorder"])-1: # Ignore the root ar["inds"][n] = switches_key[nr-i]
[docs]class ModelOrderMetropolis(pymc.Metropolis): """ Custom step method for model order """ def __init__(self, stochastic): pymc.Metropolis.__init__(self, stochastic, scale=1., proposal_distribution="prior")
[docs] def propose(self): cur_order = self.stochastic.value indexes_to_swap = random.sample(range(len(cur_order)),2) new_order = cur_order[:] # Swap values new_order[indexes_to_swap[0]], new_order[indexes_to_swap[1]] = new_order[indexes_to_swap[1]], new_order[indexes_to_swap[0]] self.stochastic.value = new_order
[docs] def reject(self): self.rejected += 1 self.stochastic.value = self.stochastic.last_value
[docs] def competence(self): return 0
[docs]class SwitchpointMetropolis(pymc.Metropolis): """ Custom step algorithm for selecting a new switchpoint """ def __init__(self, stochastic, tree, seg_map, stepsize=0.05, seglen=0.02): pymc.Metropolis.__init__(self, stochastic, scale=0,proposal_sd=1, proposal_distribution="prior") root_to_tip_length = tree.max_tippath() self.tree = tree self.seg_map = seg_map self.stepsize = stepsize * root_to_tip_length self.seg_size = seglen * root_to_tip_length
[docs] def propose(self): # Following BAMM, switchpoint movements can be either global or # local, with global switch happening 1/10 times if (random.choice(range(10))==0): self.global_step() else: self.local_step()
[docs] def local_step(self): prev_location = self.stochastic.value new_location = cyexpokit.local_step(prev_location,self.stepsize,self.seg_size,self.adaptive_scale_factor) self.stochastic.value = new_location
[docs] def global_step(self): self.stochastic.value = cyexpokit.random_tree_location(self.seg_map)
[docs] def reject(self): self.rejected += 1 self.stochastic.value = self.stochastic.last_value
[docs] def competence(self): return 0
[docs]def round_step_size(step_size, seg_size): """ Round step_size to the nearest segment """ if (step_size%seg_size) > (seg_size/2): return step_size + (seg_size-(step_size%seg_size)) + 1e-15 else: return step_size - (step_size%seg_size) + 1e-15
[docs]def tree_map(tree, seglen=0.02): """ Make a map of the tree cut into segments of size (seglen*root_to_tip_length) """ root_to_tip_length = tree.max_tippath() seg_size = seglen * root_to_tip_length seg_map = [] seen = [tree] for node in tree.descendants(): cur_len = node.length nseg = int(ceil(cur_len/seg_size)) for n in range(nseg): seg_map.append((node, n*seg_size+1e-15)) return np.array(seg_map,dtype=object)
[docs]def random_tree_location(seg_map): """ Select random location on tree with uniform probability Returns: tuple: node and float, which represents the how far along the branch to the parent node the switch occurs on """ i = random.choice(range(len(seg_map))) return seg_map[i]
[docs]def make_switchpoint_stoch(seg_map, name=str("switch")): startingval = cyexpokit.random_tree_location(seg_map) @pymc.stochastic(dtype=object, name=name) def switchpoint_stoch(value = startingval): # Flat prior on switchpoint location return 0 return switchpoint_stoch
[docs]def make_modelorder_stoch(mods, name=str("modorder")): startingval = mods @pymc.stochastic(dtype=tuple, name=name) def modelorder_stoch(value=startingval): return 0 return modelorder_stoch
[docs]def mk_multi_bayes(tree, chars,nregime,qidx, pi="Equal" ,seglen=0.02,stepsize=0.05): """ Create a Bayesian multi-mk model. User specifies which regime models to use and the Bayesian model finds the switchpoints. Args: tree (Node): Root node of tree. chars (dict): Dict mapping tip labels to discrete character states. Character states must be in the form of [0,1,2...] regime (int): The number of distinct regimes to test. Set to 1 for an Mk model, set to greater than 1 for a multi-regime Mk model. qidx (np.array): Index specifying the model to test columns: 0, 1, 2 - index axes of q 3 - index of params This scheme allows flexible specification of models. E.g.: Symmetric mk2: params = [0.2]; qidx = [[0,0,1,0], [0,1,0,0]] Asymmetric mk2: params = [0.2,0.6]; qidx = [[0,0,1,0], [0,1,0,1]] NOTE: The qidx corresponding to the first q matrix (first column 0) is always the root regime pi (str or np.array): Option to weight the root node by given values. Either a string containing the method or an array of weights. Weights should be given in order. Accepted methods of weighting root: Equal: flat prior Equilibrium: Prior equal to stationary distribution of Q matrix Fitzjohn: Root states weighted by how well they explain the data at the tips. seglen (float): Size of segments to break tree into. The smaller this value, the more "fine-grained" the analysis will be. Optional, defaults to 2% of the root-to-tip length. stepsize (float): Maximum size of steps for switchpoints to take. Optional, defaults to 5% of root-to-tip length. """ if type(chars) == dict: data = chars.copy() chars = [chars[l] for l in [n.label for n in tree.leaves()]] else: data = dict(zip([n.label for n in tree.leaves()],chars)) # Preparations nchar = len(set(chars)) nparam = len(set([n[-1] for n in qidx])) # This model has 2 components: Q parameters and switchpoints # They are combined in a custom likelihood function ########################################################################### # Switchpoint: ########################################################################### # Modeling the movement of the regime shift(s) is the tricky part # Regime shifts will only be allowed to happen at a node seg_map = tree_map(tree,seglen) switch = [None]*(nregime-1) for regime in range(nregime-1): switch[regime]= make_switchpoint_stoch(seg_map, name=str("switch_{}".format(regime))) ########################################################################### # Qparams: ########################################################################### # Each Q parameter is an exponential Qparams = [None] * nparam for i in range(nparam): Qparams[i] = pymc.Exponential(name=str("Qparam_{}".format(i)), beta=1.0, value=0.1*(i+1)) ########################################################################### # Likelihood ########################################################################### # The likelihood function l = cyexpokit.make_mklnl_func(tree, data,nchar,nregime,qidx) @pymc.deterministic def likelihood(q = Qparams, s=switch,name="likelihood"): return l(np.array(q),np.array([x[0].ni for x in s],dtype=np.intp),np.array([x[1] for x in s])) @pymc.potential def multi_mklik(lnl=likelihood): if not (np.isnan(lnl)): return lnl else: return -np.inf mod = pymc.MCMC(locals()) for s in switch: mod.use_step_method(SwitchpointMetropolis, s, tree, seg_map,stepsize=stepsize,seglen=seglen) return mod