Source code for galsampler.crossmatch
"""
"""
import numpy as np
__all__ = ("crossmatch", "compute_richness")
[docs]
def crossmatch(x, y, skip_bounds_checking=False):
"""
Finds where the elements of ``x`` appear in the array ``y``, including repeats.
The elements in x may be repeated, but the elements in y must be unique.
The arrays x and y may be only partially overlapping.
Applications of this function involve cross-matching two catalogs which share an ID.
Parameters
----------
x : integer array
Array of integers with possibly repeated entries.
y : integer array
Array of unique integers.
skip_bounds_checking : bool, optional
The first step in the `crossmatch` function is to test that the input
arrays satisfy the assumptions of the algorithm
(namely that ``x`` and ``y`` store integers,
and that all values in ``y`` are unique).
If ``skip_bounds_checking`` is set to True,
this testing is bypassed and the function evaluates faster.
Default is False.
Returns
-------
idx_x : integer array
Integer array used to apply a mask to x
such that x[idx_x] = y[idx_y]
y_idx : integer array
Integer array used to apply a mask to y
such that x[idx_x] = y[idx_y]
"""
# Ensure inputs are Numpy arrays
x = np.atleast_1d(x)
y = np.atleast_1d(y)
# Require that the inputs meet the assumptions of the algorithm
if skip_bounds_checking is True:
pass
else:
try:
assert len(set(y)) == len(y)
assert np.all(np.array(y, dtype=np.int64) == y)
assert np.shape(y) == (len(y),)
except (AssertionError, ValueError, TypeError):
msg = "Input array y must be a 1d sequence of unique integers"
raise ValueError(msg)
try:
assert np.all(np.array(x, dtype=np.int64) == x)
assert np.shape(x) == (len(x),)
except (AssertionError, ValueError, TypeError):
msg = "Input array x must be a 1d sequence of integers"
raise ValueError(msg)
# Internally, we will work with sorted arrays, and then undo the sorting at the end
idx_x_sorted = np.argsort(x)
idx_y_sorted = np.argsort(y)
x_sorted = np.copy(x[idx_x_sorted])
y_sorted = np.copy(y[idx_y_sorted])
# x may have repeated entries
# Address by finding the unique values as well as their multiplicity
unique_xvals, counts = np.unique(x_sorted, return_counts=True)
# Determine which of the unique x values has a match in y
unique_xval_has_match = np.in1d(unique_xvals, y_sorted, assume_unique=True)
# Create a boolean array with True for each value in x with a match, otherwise False
idx_x = np.repeat(unique_xval_has_match, counts)
# For each unique value of x with a match in y, identify the index of the match
matching_indices_in_y = np.searchsorted(
y_sorted, unique_xvals[unique_xval_has_match]
)
# Repeat each matching index according to the multiplicity in x
idx_y = np.repeat(matching_indices_in_y, counts[unique_xval_has_match])
# Undo the original sorting and return the result
return idx_x_sorted[idx_x], idx_y_sorted[idx_y]
[docs]
def compute_richness(unique_halo_ids, halo_id_of_galaxies):
r"""For every ID in unique_halo_ids,
calculate the number of times the ID appears in halo_id_of_galaxies.
Parameters
----------
unique_halo_ids : ndarray
Numpy array of shape (num_halos, ) storing unique integers
halo_id_of_galaxies : ndarray
Numpy integer array of shape (num_galaxies, ) storing the host ID of each galaxy
Returns
-------
richness : ndarray
Numpy integer array of shape (num_halos, ) storing richness of each host halo
Examples
--------
>>> num_hosts = 100
>>> num_sats = int(1e5)
>>> unique_halo_ids = np.arange(5, num_hosts + 5)
>>> halo_id_of_galaxies = np.random.randint(0, 5000, num_sats)
>>> richness = compute_richness(unique_halo_ids, halo_id_of_galaxies)
"""
unique_halo_ids = np.atleast_1d(unique_halo_ids).astype(int)
halo_id_of_galaxies = np.atleast_1d(halo_id_of_galaxies).astype(int)
richness_result = np.zeros_like(unique_halo_ids).astype(int)
vals, counts = np.unique(halo_id_of_galaxies, return_counts=True)
idxA, idxB = crossmatch(vals, unique_halo_ids)
richness_result[idxB] = counts[idxA]
return richness_result