In [ ]:
import os

import arviz as az
import matplotlib.pyplot as plt
import corner
import pandas as pd
import seaborn as sns

import numpy as np

import jax.numpy as jnp
from jax import random

import numpyro
import numpyro.distributions as dist
import numpyro.optim as optim
# from numpyro.infer import SVI, Trace_ELBO
# from numpyro.infer.autoguide import AutoLaplaceApproximation

if "SVG" in os.environ:
    %config InlineBackend.figure_formats = ["svg"]
az.style.use("arviz-darkgrid")
# numpyro.set_platform("cpu")

Statistical model

Input data: $$D ≝ \big\{(\underbrace{w_i}_{\text{website } i}, \overbrace{m_j}^{\text{MAC address } j}, \underbrace{t_k}_{\text{timestamp } k})\big\}_{(i, j, k) ∈ I × J × K}$$

For every $m_j$: $$n_j \sim \text{Geometric}(p)$$ where

  • $p$ is the proportion of legit users in the population.
  • $n_j$ is the number of users (scammers + the real user) claiming to have MAC address $m_j$

    For every $w_i$: $$\begin{cases} \quad λ_1^{(i, j)} \sim \text{Gamma}(α^+_i, 1) \\ ∀ 2 ≤ \ell ≤ n_j, \quad λ_\ell^{(i, j)} \sim \text{Gamma}(α^{-}_i, 1)\end{cases}$$ where

    • $λ_\ell^{(i, j)}$ is the request submission (Poisson) rate of user $i$ (user 1 = real users, others = scammers) on website $w_i$
    • NB1: $\dfrac{λ_1^{(i, j)}}{\sum_{\ell} λ_{\ell}^{(i, j)}}$ follows a $\text{Beta}(α^+_i, (n_j-1) α^-_i)$ distribution, of mean $\frac{α^+_i}{α^+_i + (n_j-1) α^-_i}$
    • NB2: for every user $\ell$: $$\begin{cases} \quad \Big(\frac{λ_1^{(i, j)}}{\sum_{i'} λ_1^{(i', j)}}\Big)_i \sim \text{Dirichlet}\Big(\big(\frac{α^+_i}{\sum_{i'} α^+_{i'}}\big)_i\Big) \\ ∀ 2 ≤ \ell ≤ n_j, \Big(\frac{λ_\ell^{(i, j)}}{\sum_{i'} λ_\ell^{(i', j)}}\Big)_i \sim \text{Dirichlet}\Big(\big(\frac{α^-_i}{\sum_{i'} α^-_{i'}}\big)_i\Big) \end{cases}$$

For every $1 ≤ k' ≤ k$ such that $(w_i, m_j, t_k) ∈ D$: $$t_{k'+1}-t_{k'} \overset{\text{obs}}{\sim} \text{Exp}\Big(\sum_{\ell} λ_\ell^{(i, j)}\Big)$$ where

  • $t_{k'+1}-t_{k'}$ are the interoccurrence times between requests from the MAC address $m_j$ on website $w_i$, exponentially distributed because of the Poisson aggregation property (the Poisson random variables are independent): $$\sum_{\ell} \text{Poisson}(λ_{\ell}^{(i, j)}) ≅ \text{Poisson}\Big(\sum_{\ell} λ_{\ell}^{(i, j)}\Big)$$

Implementation

Synthetic Data

In [ ]:
from faker import Faker
import dumper
In [ ]:
# Proportion of legit (non scammers) users in the population
p_legit = 0.95

def generate_data(nb_users = 1000, nb_websites = 100, p_legit=p_legit, nb_days = 30, seed=1):
    """Generate synthetic data: 
    Input:
    - nb_users users
    - nb_websites websites
    - p_legit probability for a user to be legit
    - seed for reproducibility
    
    Creates a dictionary mapping users (legit and scammers) to mac addresses (scammers use a mac address already used by a legit user) and returns: 

    Output:
    - a list of users (legit and scammers)
    - a list of websites
    - a list of mac addresses
    - a list of visits (mac address, website, timestamp) over nb_days days
    """
    fake = Faker()
    Faker.seed(seed)
    legit_users, scammers = [], []
    key = random.PRNGKey(seed)
    for _ in range(nb_users):
        key, subkey = random.split(key)
        is_legit = random.bernoulli(subkey, p_legit)
        legit_users.append(fake.name()) if is_legit else scammers.append(fake.name())
    
    users = legit_users + scammers
    legit_users = np.array(legit_users)
    scammers = np.array(scammers)
    websites = [fake.domain_name() for i in range(nb_websites)]

    mac_addresses = np.array([fake.mac_address() for i in range(len(legit_users))])

    users_mac = dict(zip(users, mac_addresses))
    for scammer in scammers:
        users_mac[scammer] = np.random.choice(mac_addresses)

    # Proportion of legit users visiting each website
    key, subkey = random.split(key)
    p_legit_website = random.uniform(subkey, (nb_websites,))

    visits = []
    for i, website in enumerate(websites):
        key, subkey = random.split(key)
        nb_visits = random.poisson(subkey, p_legit_website[i] * nb_users)
        
        visiting_legit_users = np.random.choice(legit_users, (nb_visits,))

        visiting_scammers = np.random.choice(scammers, (nb_visits - len(visiting_legit_users),))

        key, subkey = random.split(key)
        poisson_rate_legit = 20
        for user in visiting_legit_users:
            key, subkey = random.split(key)
            nb_visits = random.poisson(subkey, poisson_rate_legit)
            key, subkey = random.split(key)
            timestamps = random.uniform(subkey, (nb_visits,), minval=0, maxval=nb_days)
            visits.extend([(users_mac[user], website, timestamp) for timestamp in timestamps])

        key, subkey = random.split(key)
        poisson_rate_scam = 30
        for user in visiting_scammers:
            key, subkey = random.split(key)
            nb_visits = random.poisson(subkey, poisson_rate_scam)
            key, subkey = random.split(key)
            timestamps = random.uniform(subkey, (nb_visits,), minval=0, maxval=nb_days)
            visits.extend([(users_mac[user], website, timestamp) for timestamp in timestamps])

    return users, websites, mac_addresses, visits
In [ ]:
from numpyro import distributions as dist, infer

numpyro.set_host_device_count(2)


def model(websites, mac_addresses, visits, p_legit):
    for j, m_j in enumerate(mac_addresses):
        # sample n_j from a Geometric distribution
        n_j = numpyro.sample(f"n_{j}", dist.Geometric(p_legit))

        λ = {}

        # For every website, sample a vector of alpha (shape parameters) normally distributed with mean 15 and std 5 (for legit users) and mean 20 and std 5 (for scammers)
        
        αs_legit = numpyro.sample(f"αs_legit", dist.Normal(15 * jnp.ones(len(websites)), 5 * jnp.ones(len(websites))))

        αs_scam = numpyro.sample(f"αs_scam", dist.Normal(20 * jnp.ones(len(websites)), 5 * jnp.ones(len(websites))))
        
        for i, w_i in enumerate(websites):
            # sample λ rate of legit user from a Gamma distribution
            λ[(i, j, 0)] = numpyro.sample(f"λ_{(i,j,1)}", dist.Gamma(αs_legit[i], 1))
            
            for l in range(1, n_j):
                # sample λ rates of scammers user from a Gamma distribution
                λ[(i, j, l)] = numpyro.sample(f"λ_{(i,j,l)}", dist.Gamma(αs_scam[i], 1))
            
            # Collect timestamps of visits of user j on website i
            timestamps = jnp.sort([visit[2] for visit in visits if visit[0] == m_j and visit[1] == w_i])

            # Compute time intervals between visits
            time_intervals = jnp.diff(timestamps)
            sum_λ = jnp.sum(jnp.array([λ[(i, j, l)] for l in range(n_j)]))

            numpyro.sample("time_intervals", dist.Exp(sum_λ), obs=time_intervals)
In [ ]:
users, websites, mac_addresses, visits = generate_data()

sampler = infer.MCMC(
    infer.NUTS(model),
    num_warmup=100,
    num_samples=500,
    num_chains=2,
    progress_bar=True,
)

%time sampler.run(random.PRNGKey(0), websites, mac_addresses, visits, p_legit)