import os
import arviz as az
import matplotlib.pyplot as plt
import corner
import pandas as pd
import seaborn as sns
import numpy as np
import jax.numpy as jnp
from jax import random
import numpyro
import numpyro.distributions as dist
import numpyro.optim as optim
# from numpyro.infer import SVI, Trace_ELBO
# from numpyro.infer.autoguide import AutoLaplaceApproximation
if "SVG" in os.environ:
%config InlineBackend.figure_formats = ["svg"]
az.style.use("arviz-darkgrid")
# numpyro.set_platform("cpu")
Input data: $$D ≝ \big\{(\underbrace{w_i}_{\text{website } i}, \overbrace{m_j}^{\text{MAC address } j}, \underbrace{t_k}_{\text{timestamp } k})\big\}_{(i, j, k) ∈ I × J × K}$$
For every $m_j$: $$n_j \sim \text{Geometric}(p)$$ where
$n_j$ is the number of users (scammers + the real user) claiming to have MAC address $m_j$
For every $w_i$: $$\begin{cases} \quad λ_1^{(i, j)} \sim \text{Gamma}(α^+_i, 1) \\ ∀ 2 ≤ \ell ≤ n_j, \quad λ_\ell^{(i, j)} \sim \text{Gamma}(α^{-}_i, 1)\end{cases}$$ where
For every $1 ≤ k' ≤ k$ such that $(w_i, m_j, t_k) ∈ D$: $$t_{k'+1}-t_{k'} \overset{\text{obs}}{\sim} \text{Exp}\Big(\sum_{\ell} λ_\ell^{(i, j)}\Big)$$ where
from faker import Faker
import dumper
# Proportion of legit (non scammers) users in the population
p_legit = 0.95
def generate_data(nb_users = 1000, nb_websites = 100, p_legit=p_legit, nb_days = 30, seed=1):
"""Generate synthetic data:
Input:
- nb_users users
- nb_websites websites
- p_legit probability for a user to be legit
- seed for reproducibility
Creates a dictionary mapping users (legit and scammers) to mac addresses (scammers use a mac address already used by a legit user) and returns:
Output:
- a list of users (legit and scammers)
- a list of websites
- a list of mac addresses
- a list of visits (mac address, website, timestamp) over nb_days days
"""
fake = Faker()
Faker.seed(seed)
legit_users, scammers = [], []
key = random.PRNGKey(seed)
for _ in range(nb_users):
key, subkey = random.split(key)
is_legit = random.bernoulli(subkey, p_legit)
legit_users.append(fake.name()) if is_legit else scammers.append(fake.name())
users = legit_users + scammers
legit_users = np.array(legit_users)
scammers = np.array(scammers)
websites = [fake.domain_name() for i in range(nb_websites)]
mac_addresses = np.array([fake.mac_address() for i in range(len(legit_users))])
users_mac = dict(zip(users, mac_addresses))
for scammer in scammers:
users_mac[scammer] = np.random.choice(mac_addresses)
# Proportion of legit users visiting each website
key, subkey = random.split(key)
p_legit_website = random.uniform(subkey, (nb_websites,))
visits = []
for i, website in enumerate(websites):
key, subkey = random.split(key)
nb_visits = random.poisson(subkey, p_legit_website[i] * nb_users)
visiting_legit_users = np.random.choice(legit_users, (nb_visits,))
visiting_scammers = np.random.choice(scammers, (nb_visits - len(visiting_legit_users),))
key, subkey = random.split(key)
poisson_rate_legit = 20
for user in visiting_legit_users:
key, subkey = random.split(key)
nb_visits = random.poisson(subkey, poisson_rate_legit)
key, subkey = random.split(key)
timestamps = random.uniform(subkey, (nb_visits,), minval=0, maxval=nb_days)
visits.extend([(users_mac[user], website, timestamp) for timestamp in timestamps])
key, subkey = random.split(key)
poisson_rate_scam = 30
for user in visiting_scammers:
key, subkey = random.split(key)
nb_visits = random.poisson(subkey, poisson_rate_scam)
key, subkey = random.split(key)
timestamps = random.uniform(subkey, (nb_visits,), minval=0, maxval=nb_days)
visits.extend([(users_mac[user], website, timestamp) for timestamp in timestamps])
return users, websites, mac_addresses, visits
from numpyro import distributions as dist, infer
numpyro.set_host_device_count(2)
def model(websites, mac_addresses, visits, p_legit):
for j, m_j in enumerate(mac_addresses):
# sample n_j from a Geometric distribution
n_j = numpyro.sample(f"n_{j}", dist.Geometric(p_legit))
λ = {}
# For every website, sample a vector of alpha (shape parameters) normally distributed with mean 15 and std 5 (for legit users) and mean 20 and std 5 (for scammers)
αs_legit = numpyro.sample(f"αs_legit", dist.Normal(15 * jnp.ones(len(websites)), 5 * jnp.ones(len(websites))))
αs_scam = numpyro.sample(f"αs_scam", dist.Normal(20 * jnp.ones(len(websites)), 5 * jnp.ones(len(websites))))
for i, w_i in enumerate(websites):
# sample λ rate of legit user from a Gamma distribution
λ[(i, j, 0)] = numpyro.sample(f"λ_{(i,j,1)}", dist.Gamma(αs_legit[i], 1))
for l in range(1, n_j):
# sample λ rates of scammers user from a Gamma distribution
λ[(i, j, l)] = numpyro.sample(f"λ_{(i,j,l)}", dist.Gamma(αs_scam[i], 1))
# Collect timestamps of visits of user j on website i
timestamps = jnp.sort([visit[2] for visit in visits if visit[0] == m_j and visit[1] == w_i])
# Compute time intervals between visits
time_intervals = jnp.diff(timestamps)
sum_λ = jnp.sum(jnp.array([λ[(i, j, l)] for l in range(n_j)]))
numpyro.sample("time_intervals", dist.Exp(sum_λ), obs=time_intervals)
users, websites, mac_addresses, visits = generate_data()
sampler = infer.MCMC(
infer.NUTS(model),
num_warmup=100,
num_samples=500,
num_chains=2,
progress_bar=True,
)
%time sampler.run(random.PRNGKey(0), websites, mac_addresses, visits, p_legit)