From ba724bc1b2e756ce48edfb14cfe250fc32953f04 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 1 Aug 2023 22:47:37 +0200
Subject: [PATCH] (scripts|docs) Update scripts and documentation for the new
 operator's gui and file storage workflows.

---
 doc/crawling.md         | 75 ++++++++++-----------------------
 run/download-samples.sh | 40 ++++++++++++++++++
 run/readme.md           | 26 +++++++++++-
 run/reconvert.sh        | 91 -----------------------------------------
 4 files changed, 86 insertions(+), 146 deletions(-)
 create mode 100755 run/download-samples.sh
 delete mode 100755 run/reconvert.sh

diff --git a/doc/crawling.md b/doc/crawling.md
index 46f476b9..cfb38f9e 100644
--- a/doc/crawling.md
+++ b/doc/crawling.md
@@ -1,6 +1,6 @@
 # Crawling
 
-This document is a first draft.
+This document is a draft.
 
 ## WARNING
 Please don't run the crawler unless you intend to actually operate a public
@@ -23,6 +23,11 @@ it doesn't need to be extremely fast, but it should be a few terabytes in size.
 with `noatime` and partitioned with a large block size. It may be a good idea to
 format the disk with a block size of 4096 bytes. This will reduce the amount of
 disk space used by the crawler.
+Make sure you configure the user-agent properly. This will be used to identify the crawler,
+and is matched against the robots.txt file. The crawler will not crawl sites that don't allow it.
+
+This can be done by editing the file `${WMSA_HOME}/conf/user-agent`.
+
 ## Setup
 To operate the crawler, you need to set up a filesystem structure.
 
@@ -45,66 +50,28 @@ $ mkdir /data/processed
 ### Specifications
 
 A crawl specification file is a compressed JSON file with each domain name to crawl, as well as
-known URLs for each domain. These are created with the [crawl-job-extractor](../tools/crawl-job-extractor/)
-tool.
+known URLs for each domain. These are created in the `storage -> specifications` view in the operator's gui.
 
-Let's put this in `/data/crawl.spec`
+To bootstrap the system, you need a list of known domains. This is just a text file with one domain name per line,
+with blank lines and comments starting with `#` ignored.
 
-### Crawl Plan
-
-You also need a crawl plan. This is a YAML file that specifies where to store the crawl data. This
-file is also used by the converter.
-
-This is an example from production. Note that the crawl specification mentioned previously is pointed
-to by the `jobSpec` key.
-
-```yaml
-jobSpec: "/data/crawl.spec"
-crawl:
-  dir: "/data/crawl"
-  logName: "crawler.log"
-process:
-  dir: "/data/processed"
-  logName: "process.log"
-```
-
-Let's put it in `/data/crawl-plan.yaml`
+Make it available over HTTP(S) and select `Download a list of domains from a URL` in the `Create New Specification`
+form. Make sure to give this specification a good description, as it will follow you around for a while.
 
 ## Crawling
 
-Run the crawler-process script with the crawl plan as an argument.
+Refresh the specification list in the operator's gui. You should see your new specification in the list.
+Click the `[Info]` link next to it and select `[Crawl]` under `Actions`.
 
-In practice something like this:
-
-```bash
-screen sudo -u searchengine WMSA_HOME=/path/to/install/dir ./crawler-process /data/crawl-plan.yaml
-```
-
-This proces will run for a long time, up to a week. It will journal its progress in `crawler.log`,
-and if the process should halt somehow, it replay the journal and continue where was. Do give it a
-while before restarting though, to not annoy webmasters by re-crawling a bunch of websites.
-
-The crawler will populate the crawl directory with a directory structure. Note that on mechanical drives,
-removing these files will take hours. You probably want a separate hard drive for this as the filesystem
-will get severely gunked up.
+Depending on the size of the specification, this may take anywhere from a few minutes to a few weeks.
+You can follow the progress in the `Actors` view.
 
 ## Converting
 
-The converter process takes the same argument as the crawler process. It will read the crawl data
-and extract keywords and metadata and save them as compressed JSON models. It will create another huge
-directory structure in the process directory, and uses its own journal to keep track of progress.
+Once the crawl is finished, you can convert the data to a format that can be loaded into the database.
+This is done by going to the `storage -> crawl` view in the operator's gui, clicking the `[Info]` link
+and pressing `[Convert]` under `Actions`.
 
-```bash
-screen sudo -u searchengine WMSA_HOME=/path/to/install/dir ./converter-process /data/crawl-plan.yaml
-```
-
-**Note:** This process will use *a lot* of CPU. Expect every available core to be at 100% for several days.
-
-## Loader
-
-The loader process takes the same argument as the crawler and converter processes. It will read converted
-data and insert it into the database and create a lexicon and index journal.
-
-**Note:** It will wipe the URL database before inserting data. It is a good idea to
-bring the entire search-engine offline while this is happening. The loader will run
-for a day or so.
\ No newline at end of file
+The rest of the process should be automatic. Follow the progress in the `Actors` view; the actor
+`RECONVERT_LOAD` drives the process. The process can be stopped by terminating this actor. Depending on the
+state, it may be necessary to restart from the beginning.
\ No newline at end of file
diff --git a/run/download-samples.sh b/run/download-samples.sh
new file mode 100755
index 00000000..2465c50b
--- /dev/null
+++ b/run/download-samples.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -e
+
+SAMPLE_NAME=crawl-${1:-m}
+SAMPLE_DIR="samples/${SAMPLE_NAME}/"
+
+function download_model {
+  model=$1
+  url=$2
+
+  if [ ! -f $model ]; then
+    echo "** Downloading $url"
+    wget -O $model $url
+  fi
+}
+
+pushd $(dirname $0)
+
+if [ -d ${SAMPLE_DIR} ]; then
+  echo "${SAMPLE_DIR} already exists; remove it if you want to re-download the sample"
+fi
+
+mkdir -p samples/
+SAMPLE_TARBALL=samples/${SAMPLE_NAME}.tar.gz
+download_model ${SAMPLE_TARBALL} https://downloads.marginalia.nu/${SAMPLE_TARBALL} || rm ${SAMPLE_TARBALL}
+
+if [ ! -f ${SAMPLE_TARBALL} ]; then
+  echo "!! Failed"
+  exit 255
+fi
+
+mkdir -p ${SAMPLE_DIR}
+tar zxf ${SAMPLE_TARBALL} --strip-components=1 -C ${SAMPLE_DIR}
+
+cat > "${SAMPLE_DIR}/marginalia-manifest.json" <
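
As a usage sketch for the new `run/download-samples.sh` helper: the optional argument selects a sample-size suffix and defaults to `m` (per `SAMPLE_NAME=crawl-${1:-m}`); which other suffixes are actually published on downloads.marginalia.nu is an assumption, not something the patch states.

```bash
# Fetch and unpack a sample crawl set. With "m" (the script's default),
# this downloads samples/crawl-m.tar.gz from downloads.marginalia.nu and
# unpacks it into run/samples/crawl-m/ (the script pushd's into run/).
# Other size suffixes are assumed to exist and may not.
./run/download-samples.sh m
```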
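For the bootstrap step in the updated crawling doc, a minimal sketch of the domain list it describes (one domain per line, blank lines and `#` comments ignored). The file name and the choice of seed domains here are hypothetical placeholders; serve the resulting file over HTTP(S) and point `Download a list of domains from a URL` at it.

```bash
# Hypothetical bootstrap domain list in the documented format.
cat > domains.txt <<'EOF'
# seed domains, one per line
www.marginalia.nu
search.marginalia.nu

memex.marginalia.nu
EOF
```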