# noinspection PyUnresolvedReferences
|
|
cimport numpy as np
|
|
import numpy as np
|
|
from cpython cimport array
|
|
import array
|
|
import cython
|
|
|
|
from cython.parallel import prange
|
|
from libc.stdlib cimport malloc, free
|
|
from libcpp.algorithm cimport sort
|
|
from libc.stdint cimport uint64_t
|
|
|
|
cpdef tuple k_means(float[:] data, unsigned int k=2):
    """
    Perform 1D k-means clustering on the given samples.

    :param data: samples to cluster
    :param k: desired number of clusters; reduced to the number of
              distinct values in data when there are fewer than k
    :return: tuple (centers, clusters) where centers is a float32
             memoryview with one center per cluster and clusters is a
             list of lists holding the samples assigned to each cluster
    """
    cdef list clusters = []
    cdef set unique = set(data)
    cdef unsigned long i
    cdef float[:] centers

    if len(unique) < k:
        print("Warning: less different values than k")
        k = len(unique)

    # Allocate centers only after k is final. Allocating with the
    # original k would return trailing uninitialized entries from
    # np.empty whenever k had to be reduced.
    centers = np.empty(k, dtype=np.float32)

    for i in range(k):
        centers[i] = unique.pop()  # arbitrary distinct initial centers
        clusters.append([])

    cdef float[:] old_centers = np.array(centers, dtype=np.float32)
    cdef float distance, min_distance, error = 1.0
    cdef unsigned int j, index = 0, N = len(data)

    while error != 0:
        for i in range(k):
            clusters[i].clear()

        # Assignment step: each sample goes to its nearest center
        # (squared distance; no sqrt needed for comparison).
        for i in range(N):
            min_distance = (centers[0] - data[i]) * (centers[0] - data[i])
            index = 0
            for j in range(1, k):
                distance = (centers[j] - data[i]) * (centers[j] - data[i])
                if distance < min_distance:
                    min_distance = distance
                    index = j
            clusters[index].append(data[i])

        # Update step: move each center to the mean of its cluster.
        old_centers = np.array(centers)
        for i in range(k):
            if clusters[i]:
                centers[i] = np.mean(clusters[i])
            # else: keep the previous center — np.mean([]) would yield
            # NaN and permanently poison this cluster.

        # Convergence = total squared center movement. The previous
        # signed sum of old^2 - new^2 could cancel across clusters and
        # stop the iteration before the centers were stable.
        error = 0.0
        for i in range(k):
            error += (old_centers[i] - centers[i]) * (old_centers[i] - centers[i])

    return centers, clusters
|
|
|
|
|
|
def segment_messages_from_magnitudes(cython.floating[:] magnitudes, float noise_threshold):
    """
    Get the list of start, end indices of messages

    :param magnitudes: Magnitudes of samples
    :param noise_threshold: Magnitude above which a sample counts as signal;
                            at or below it counts as noise
    :return: list of (start, end) index tuples, one per detected message
    """
    cdef list result = []

    if len(magnitudes) == 0:
        return []

    cdef unsigned long i, N = len(magnitudes), start = 0

    # tolerance / robustness against outliers: this many consecutive
    # samples on the other side of the threshold are required before a
    # state change is accepted.
    cdef unsigned int outlier_tolerance = 10
    cdef unsigned int conseq_above = 0, conseq_below = 0

    # Two states: 1 = above noise (inside a message), -1 = in noise
    cdef int state
    state = 1 if magnitudes[0] > noise_threshold else -1

    cdef bint is_above_noise

    for i in range(N):
        is_above_noise = magnitudes[i] > noise_threshold
        if state == 1:
            if is_above_noise:
                conseq_below = 0
            else:
                conseq_below += 1
        elif state == -1:
            if is_above_noise:
                conseq_above += 1
            else:
                conseq_above = 0

        # Perform state change if necessary
        if state == 1 and conseq_below >= outlier_tolerance:
            # 1 -> -1: message ended conseq_below samples ago
            state = -1
            result.append((start, i - conseq_below))
            conseq_below = conseq_above = 0
        elif state == -1 and conseq_above >= outlier_tolerance:
            # -1 -> 1: message started conseq_above samples ago
            state = 1
            start = i - conseq_above
            conseq_below = conseq_above = 0

    # append last message if we are still inside one at the end
    if state == 1 and start < N - conseq_below:
        result.append((start, N - conseq_below))

    return result
|
|
|
|
cpdef uint64_t[:] get_threshold_divisor_histogram(uint64_t[:] plateau_lengths, float threshold=0.2):
    """
    Get a histogram (i.e. count) how many times a value is a threshold divisor for other values in given data

    Threshold divisible is defined as having a decimal place less than .2 (threshold)

    :param plateau_lengths: values to compare pairwise against each other
    :param threshold: maximum fractional part of the quotient for the
                      smaller value to count as a divisor of the larger
    :return: histogram indexed by value; entry v counts how often v was
             a threshold divisor of a larger value in plateau_lengths
    """
    cdef uint64_t i, j, x, y, minimum, maximum, num_lengths = len(plateau_lengths)

    # One bucket per possible value, from 0 up to the maximum value.
    cdef np.ndarray[np.uint64_t, ndim=1] histogram = np.zeros(int(np.max(plateau_lengths)) + 1, dtype=np.uint64)

    # Compare every unordered pair exactly once.
    for i in range(0, num_lengths):
        for j in range(i+1, num_lengths):
            x = plateau_lengths[i]
            y = plateau_lengths[j]
            # Skip zeros: they can neither divide nor be divided.
            if x == 0 or y == 0:
                continue

            if x < y:
                minimum = x
                maximum = y
            else:
                minimum = y
                maximum = x

            # Fractional part of maximum/minimum: float quotient minus
            # integer quotient. A small fractional part means minimum
            # (nearly) evenly divides maximum.
            if maximum / <double>minimum - (maximum / minimum) < threshold:
                histogram[minimum] += 1

    return histogram
|
|
|
|
cpdef np.ndarray[np.uint64_t, ndim=1] merge_plateaus(np.ndarray[np.uint64_t, ndim=1] plateaus,
                                                     uint64_t tolerance,
                                                     uint64_t max_count):
    """
    Merge plateau lengths that are interrupted only by short plateaus.

    A plateau of length <= tolerance is treated as an outlier: it and its
    neighbors are summed into a single merged plateau.

    :param plateaus: plateau lengths (presumably alternating between the
                     two signal states — the parity-2 lookahead below
                     relies on this; TODO confirm with callers)
    :param tolerance: plateaus of this length or shorter are merged away
    :param max_count: stop producing output after this many entries
    :return: merged plateau lengths (at most max_count + 1 entries)
    """
    cdef uint64_t j, n, L = len(plateaus), current = 0, i = 1, tmp_sum
    if L == 0:
        return np.zeros(0, dtype=np.uint64)

    cdef np.ndarray[np.uint64_t, ndim=1] result = np.empty(L, dtype=np.uint64)
    # A leading plateau within tolerance contributes a zero-length start.
    if plateaus[0] <= tolerance:
        result[0] = 0
    else:
        result[0] = plateaus[0]

    while i < L and current < max_count:
        if plateaus[i] <= tolerance:
            # Look ahead to see whether we need to merge a larger window e.g. for 67, 1, 10, 1, 21
            # Step by 2 so only plateaus at the same parity (same state)
            # are tested for being short outliers.
            n = 2
            while i + n < L and plateaus[i + n] <= tolerance:
                n += 2

            # Sum the previous plateau, the short outlier(s) and
            # everything in between into the current output slot,
            # overwriting the previously written value at `current`.
            tmp_sum = 0
            for j in range(i - 1, min(L, i + n)):
                tmp_sum += plateaus[j]

            result[current] = tmp_sum
            i += n
        else:
            # Normal plateau: advance the output position and copy it.
            current += 1
            result[current] = plateaus[i]
            i += 1

    # current is an index, so current + 1 entries were written.
    return result[:current+1]
|
|
|
|
|
|
cpdef np.ndarray[np.uint64_t, ndim=1] get_plateau_lengths(float[:] rect_data, float center, int percentage=25):
    """
    Return the lengths of consecutive runs ("plateaus") of samples that
    stay on the same side of center.

    Scanning stops once the summed emitted run lengths reach the given
    percentage of the input length; the run in progress at that point is
    not emitted.

    :param rect_data: rectified samples
    :param center: decision threshold separating the two states
    :param percentage: portion of the input to consume before stopping
    :return: plateau lengths as a uint64 numpy array
    """
    if len(rect_data) == 0 or center is None:
        return np.array([], dtype=np.uint64)

    cdef unsigned long long num_samples = len(rect_data)
    cdef unsigned long long run_length = 0, consumed = 0, pos = 0
    cdef int current_side, sample_side
    cdef float value

    # Side of the very first sample decides the initial state.
    current_side = -1 if rect_data[0] <= center else 1

    cdef array.array lengths = array.array('Q', [])

    for pos in range(0, num_samples):
        # Stop once enough samples were folded into emitted runs.
        if consumed >= percentage * num_samples / 100:
            break

        value = rect_data[pos]
        sample_side = -1 if value <= center else 1

        if sample_side == current_side:
            run_length += 1
        else:
            # Side flipped: emit the finished run and start a new one.
            lengths.append(run_length)
            consumed += run_length
            current_side = sample_side
            run_length = 1

    return np.array(lengths, dtype=np.uint64)
|
|
|
|
|
|
cdef float median(double[:] data, unsigned long start, unsigned long data_len, unsigned int k=3) nogil:
    # Return the median of the window data[start:start+k] without the GIL.
    #
    # data:     full sample buffer
    # start:    first index of the window
    # data_len: total length of data, used to clamp the window
    # k:        window size; shrunk when the window would run past the end
    cdef unsigned long i, j

    # Clamp the window so it never reads past the end of data.
    # NOTE(review): if start == data_len this makes k == 0 and
    # buffer[k//2] below reads uninitialized memory — callers must
    # guarantee start < data_len (median_filter does).
    if start + k > data_len:
        k = data_len - start

    # Copy the window into scratch memory so sorting does not mutate
    # the caller's data.
    cdef float* buffer = <float *>malloc(k * sizeof(float))
    for i in range(0, k):
        buffer[i] = data[start+i]

    # C++ std::sort on the raw float buffer, ascending.
    sort(&buffer[0], (&buffer[0]) + k)
    try:
        # Middle element of the sorted window (upper middle for even k).
        return buffer[k//2]
    finally:
        # Always release the scratch buffer, even on the return path.
        free(buffer)
|
|
|
|
cpdef np.ndarray[np.float32_t, ndim=1] median_filter(double[:] data, unsigned int k=3):
    """
    Apply a parallel median filter with window size k to the samples.

    :param data: input samples
    :param k: window size of the median filter
    :return: filtered samples as a float32 numpy array of the same length
    """
    cdef long long start, i, n = len(data)

    cdef np.ndarray[np.float32_t, ndim=1] result = np.zeros(n, dtype=np.float32)

    for i in prange(0, n, nogil=True, schedule='static'):
        # Center the window on i, clamped at the left edge; median()
        # itself clamps the right edge via data_len.
        if i < k // 2:
            start = 0
        else:
            start = i - k // 2

        # Bug fix: the clamped window start was computed but the call
        # previously passed start=i, producing a forward-looking window
        # instead of the centered one the code above prepares.
        result[i] = median(data, start=start, data_len=n, k=k)

    return result
|