From 3ca1cec8bbff0846784e8ccb51204eb3ca78688d Mon Sep 17 00:00:00 2001 From: Emelie Graven Date: Tue, 5 Apr 2022 13:11:32 +0200 Subject: [PATCH] Formalize planning document, update DB logic --- README.md | 31 +++++++++++++++++++ ingest/src/db.rs | 78 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 91 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index e69de29..92f877b 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,31 @@ +# Grimhilde + +## Use case +At dotsrc, mirror statistics are currently generated by the [mirror_stats](https://gitlab.com/dotSRC/mirror_stats) go program. While this approach works decently, it could do with improved granularity, and working with the very large json blobs it generates can be taxing. + +Grimhilde is a proposed replacement, currently in development. + +## Planning & Structure +By using the `log_format` directive in our nginx configuration, the access log output can be +customized to include only the data we need in machine-readable form rather than the default +human-readable form. Different logging facilities can also be configured, including logging over +the syslog protocol. Grimhilde emulates a syslog server by opening a local UDP socket, enabling fast +and reliable communication between nginx and Grimhilde. + +Each request processed by nginx is sent to Grimhilde, where its details are stored in a PostgreSQL +database. As the DotSrc mirror processes millions of requests each day, the amount of data we need +to store would quickly become untenable if stored traditionally. The proposed way around this is to +assign each unique piece of data in every request (request path, referrer, hostname, etc) an ID, and +only store it once, replacing any subsequent identical data with references to the initial copy. + +While this will massively decrease the amount of data stored, the database will still grow large +with time. 
The proposed solution to this is to limit the length of time the data is stored, and to +use the API to generate static views of interest prior to purging data older than e.g. three months. +This will allow staff members to generate advanced reports on real-time data to see and respond to +problematic trends, while also retaining interesting historical usage data to be published publicly +for years to come. + +## Todo +* [ ] Write tests +* [ ] Implement the GraphQL API for fetching statistics +* [ ] Find an appropriate solution for generating reports based on data from the API diff --git a/ingest/src/db.rs b/ingest/src/db.rs index f28a99c..e33cff5 100644 --- a/ingest/src/db.rs +++ b/ingest/src/db.rs @@ -20,26 +20,66 @@ pub async fn insert(pool: &PgPool, req: Request) -> Result<(), Box let mut trans = pool.begin().await?; // Get the id's for deduplicated values. Insert a value if one is missing. - let path_id = sqlx::query!( - "INSERT INTO paths (path) VALUES ($1) ON CONFLICT DO NOTHING RETURNING id", + sqlx::query!( + "INSERT INTO paths (path) VALUES ($1) ON CONFLICT DO NOTHING", &req.path, - ).fetch_one(&mut *trans).await?.id; - let remote_id = sqlx::query!( - "INSERT INTO remote_addresses (addr) VALUES ($1) ON CONFLICT DO NOTHING RETURNING id", req.addr) - .fetch_one(&mut *trans).await?.id; - let host_id = sqlx::query!( - "INSERT INTO hosts (host) VALUES ($1) ON CONFLICT DO NOTHING RETURNING id", &req.host) - .fetch_one(&mut *trans).await?.id; - let user_agent_id = sqlx::query!( - "INSERT INTO user_agents (agent) VALUES ($1) ON CONFLICT DO NOTHING RETURNING id", + ) + .execute(&mut *trans) + .await?; + sqlx::query!( + "INSERT INTO remote_addresses (addr) VALUES ($1) ON CONFLICT DO NOTHING", + &req.addr + ) + .execute(&mut *trans) + .await?; + sqlx::query!( + "INSERT INTO hosts (host) VALUES ($1) ON CONFLICT DO NOTHING", + &req.host + ) + .execute(&mut *trans) + .await?; + sqlx::query!( + "INSERT INTO user_agents (agent) VALUES ($1) ON CONFLICT DO NOTHING", 
&req.user_agent, - ).fetch_one(&mut *trans).await?.id; - let referrer_id = sqlx::query!( - "INSERT INTO referrers (referrer) VALUES ($1) ON CONFLICT DO NOTHING RETURNING id", + ) + .execute(&mut *trans) + .await?; + sqlx::query!( + "INSERT INTO referrers (referrer) VALUES ($1) ON CONFLICT DO NOTHING", &req.referrer, - ).fetch_one(&mut *trans).await?.id; + ) + .execute(&mut *trans) + .await?; + + let path_id = sqlx::query!("SELECT id FROM paths WHERE path = ($1)", &req.path) + .fetch_one(&mut *trans) + .await? + .id; + let remote_id = sqlx::query!( + "SELECT id FROM remote_addresses WHERE addr = ($1)", + &req.addr + ) + .fetch_one(&mut *trans) + .await? + .id; + let host_id = sqlx::query!("SELECT id FROM hosts WHERE host = ($1)", &req.host) + .fetch_one(&mut *trans) + .await? + .id; + let user_agent_id = sqlx::query!( + "SELECT id FROM user_agents WHERE agent = ($1)", + &req.user_agent + ) + .fetch_one(&mut *trans) + .await? + .id; + let referrer_id = sqlx::query!( + "SELECT id FROM referrers WHERE referrer = ($1)", + &req.referrer + ) + .fetch_one(&mut *trans) + .await? + .id; sqlx::query!( "INSERT INTO hits (size, path, remote, host, user_agent, referrer, secure, time) \ @@ -52,7 +92,9 @@ pub async fn insert(pool: &PgPool, req: Request) -> Result<(), Box referrer_id, req.secure, req.timestamp, - ).execute(&mut *trans).await?; + ) + .execute(&mut *trans) + .await?; trans.commit().await?; Ok(())