mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Update index.html for search engine (#25)
Co-authored-by: vlofgren <vlofgren@gmail.com> Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/25
This commit is contained in:
parent
9474f39225
commit
5c2f2d558f
18
README.md
18
README.md
@ -3,14 +3,18 @@
|
|||||||
This is the source code for marginalia.nu, including the [search engine](https://search.marginalia.nu),
|
This is the source code for marginalia.nu, including the [search engine](https://search.marginalia.nu),
|
||||||
the [MEMEX/gemini server](https://memex.marginalia.nu), and the [encyclopedia service](https://encyclopedia.marginalia.nu).
|
the [MEMEX/gemini server](https://memex.marginalia.nu), and the [encyclopedia service](https://encyclopedia.marginalia.nu).
|
||||||
|
|
||||||
The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu),
|
The aim of the project is to develop new and alternative discovery methods for the Internet.
|
||||||
it is fine to mirror it on other hosts, but if you have issues or questions
|
It's an experimental workshop as much as it is a public service; the overarching goal is to
|
||||||
that is where you want to go.
|
elevate the more human, non-commercial sides of the Internet.
|
||||||
|
|
||||||
As it stands now, the project is a bit of a mess as it wasn't developed
|
The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu).
|
||||||
with the intention of going open source, a lot of tests and so on make
|
It is fine to mirror it on other hosts, but if you have issues or questions
|
||||||
assumptions about the directory structure, much configuration is hard coded
|
git.marginalia.nu is where you want to go.
|
||||||
and so on. Please stand by. A lot of the mess is fairly superficial.
|
|
||||||
|
As it stands now, the project is still being set up and is a bit of a mess as
|
||||||
|
it wasn't developed with the intention of going open source, a lot of tests
|
||||||
|
and so on make assumptions about the directory structure, much configuration
|
||||||
|
is hard coded and so on. Please stand by. A lot of the mess is fairly superficial.
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
|
@ -66,6 +66,6 @@ public class RateLimiter {
|
|||||||
private Bucket createBucket() {
|
private Bucket createBucket() {
|
||||||
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
var refill = Refill.greedy(1, Duration.ofSeconds(refillRate));
|
||||||
var bw = Bandwidth.classic(capacity, refill);
|
var bw = Bandwidth.classic(capacity, refill);
|
||||||
return Bucket4j.builder().addLimit(bw).build();
|
return Bucket.builder().addLimit(bw).build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -331,8 +331,6 @@ public class EdgeIndexService extends Service {
|
|||||||
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
|
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
|
||||||
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
|
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
|
||||||
|
|
||||||
boolean debug = sq.searchTermsExclude.contains("special:debug");
|
|
||||||
|
|
||||||
for (int i : specBuckets) {
|
for (int i : specBuckets) {
|
||||||
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
|
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
|
||||||
|
|
||||||
@ -341,28 +339,6 @@ public class EdgeIndexService extends Service {
|
|||||||
|
|
||||||
List<EdgeSearchResultItem> resultsForBucket = new ArrayList<>(specs.limitByBucket);
|
List<EdgeSearchResultItem> resultsForBucket = new ArrayList<>(specs.limitByBucket);
|
||||||
|
|
||||||
if (debug) {
|
|
||||||
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
|
||||||
.peek(l -> logger.info("Considering {}", Long.toHexString(l)))
|
|
||||||
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
|
||||||
.filter(ri -> {
|
|
||||||
if (seenResults.contains(ri.url.getId())) {
|
|
||||||
logger.info("Seen before: {}", Integer.toHexString(ri.url.getId()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else if (!localFilter.test(i, domainCountFilter, ri)) {
|
|
||||||
logger.info("DCF: {} - {}:{}", ri.blockId, Integer.toHexString(ri.domain.getId()), Integer.toHexString(ri.url.getId()));
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
})
|
|
||||||
.limit(specs.limitTotal * 3L)
|
|
||||||
.distinct()
|
|
||||||
.limit(Math.min(specs.limitByBucket
|
|
||||||
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
|
|
||||||
.forEach(resultsForBucket::add);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
||||||
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
||||||
.filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri))
|
.filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri))
|
||||||
@ -371,7 +347,7 @@ public class EdgeIndexService extends Service {
|
|||||||
.limit(Math.min(specs.limitByBucket
|
.limit(Math.min(specs.limitByBucket
|
||||||
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
|
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
|
||||||
.forEach(resultsForBucket::add);
|
.forEach(resultsForBucket::add);
|
||||||
}
|
|
||||||
|
|
||||||
for (var result : resultsForBucket) {
|
for (var result : resultsForBucket) {
|
||||||
seenResults.add(result.url.getId());
|
seenResults.add(result.url.getId());
|
||||||
|
@ -61,8 +61,12 @@
|
|||||||
existed. </p>
|
existed. </p>
|
||||||
<p>
|
<p>
|
||||||
The software for this search engine is all custom-built, and all crawling and indexing is
|
The software for this search engine is all custom-built, and all crawling and indexing is
|
||||||
done in-house.
|
done in-house. The project is open source. Feel free to poke about in the <a
|
||||||
|
href="https://git.marginalia.nu/marginalia/marginalia.nu">source code</a> or contribute
|
||||||
|
to the development!
|
||||||
</p>
|
</p>
|
||||||
|
<p>Consider <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">supporting the
|
||||||
|
project</a>!</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="utils">
|
<div class="utils">
|
||||||
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
|
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
|
||||||
@ -98,11 +102,6 @@
|
|||||||
<section class="card">
|
<section class="card">
|
||||||
<h2>Updates</h2>
|
<h2>Updates</h2>
|
||||||
<div class="info">
|
<div class="info">
|
||||||
<p>☛ The web design of the search engine has been completely overhauled. For the most part, this should
|
|
||||||
result in even smaller page loads, and better accessibility and easier navigation, but it may still
|
|
||||||
be a bit rough in some browsers, if you do find any bugs or accessibility problems, please let me
|
|
||||||
know. You can reach me at <tt><a href="mailto:kontakt@marginalia.nu">kontakt@marginalia.nu</a></tt>.
|
|
||||||
</p>
|
|
||||||
<p>☛ The <a href="https://search.marginalia.nu/explore/random">Random Mode</a> has been overhauled, and is
|
<p>☛ The <a href="https://search.marginalia.nu/explore/random">Random Mode</a> has been overhauled, and is
|
||||||
quite entertaining. I encourage you to give it a spin. </p>
|
quite entertaining. I encourage you to give it a spin. </p>
|
||||||
<p>☛ A simple <a href="https://api.marginalia.nu/">public API</a> is now available.</p>
|
<p>☛ A simple <a href="https://api.marginalia.nu/">public API</a> is now available.</p>
|
||||||
@ -116,6 +115,8 @@
|
|||||||
<h2>Publicity, Discussion and Events</h2>
|
<h2>Publicity, Discussion and Events</h2>
|
||||||
<div class="info">
|
<div class="info">
|
||||||
<dl>
|
<dl>
|
||||||
|
<dt><a href="https://news.ycombinator.com/item?id=31536626" rel="nofollow">Marginalia Goes Open Source</a></dt>
|
||||||
|
<dd>Hacker News, 2022-05-28</dd>
|
||||||
<dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> 🎞️</dt>
|
<dt><a href="https://www.youtube.com/watch?v=rTSEr0cRJY8" rel="nofollow">You Should Check Out the Indie Web</a> 🎞️</dt>
|
||||||
<dd>YouTube, You've Got Kat, 2022-03-15 </dd>
|
<dd>YouTube, You've Got Kat, 2022-03-15 </dd>
|
||||||
<dt>
|
<dt>
|
||||||
@ -137,10 +138,10 @@
|
|||||||
</dt>
|
</dt>
|
||||||
<dd>Clive Thompson OneZero, 2021-09-16</dd>
|
<dd>Clive Thompson OneZero, 2021-09-16</dd>
|
||||||
<dt>
|
<dt>
|
||||||
<a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow">Hacker News Discussion</a>
|
<a href="https://news.ycombinator.com/item?id=28550764" rel="nofollow"> A search engine that favors text-heavy sites and punishes modern web design</a>
|
||||||
</dt>
|
</dt>
|
||||||
<dd>
|
<dd>
|
||||||
2021-09-16
|
Hacker News, 2021-09-16
|
||||||
</dd>
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
</div>
|
</div>
|
||||||
|
Loading…
Reference in New Issue
Block a user