2023-11-01 15:44:01 +01:00

235 lines
5.6 KiB
Java

package me.mrletsplay.scraperbot;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpClient.Redirect;
import java.net.http.HttpClient.Version;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import net.dv8tion.jda.api.entities.Message;
import net.dv8tion.jda.api.entities.channel.middleman.MessageChannel;
/**
 * A single crawl job that starts at one URL, follows the links it discovers and reports every
 * link matching the {@link #getFind() find} filters to a message channel.
 *
 * <p>The crawl runs on {@code ScraperBot.EXECUTOR}; configure the task (strategy, cookie, user
 * agent, filters) before calling {@link #start(MessageChannel)}.
 */
public class ScraperTask {

    private static final HttpClient CLIENT = HttpClient.newBuilder()
        .version(Version.HTTP_2)
        .followRedirects(Redirect.ALWAYS)
        .build();

    /** Maximum number of attempts made to fetch a single page before giving up on it. */
    private static final int MAX_ATTEMPTS = 5;

    /** Pause between two page visits and between two fetch attempts, in milliseconds. */
    private static final long DELAY_MILLIS = 1000;

    private final String id;
    private final String firstURL;
    private Strategy strategy;
    private String cookie; // For CF bypass, login etc.
    private String userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0";
    private final FilterList exclude;
    private final FilterList include;
    private final FilterList find;
    private Future<?> future;
    private long startTime;

    /**
     * Creates a task with a random UUID as its id and the {@link Strategy#RANDOM} strategy.
     *
     * <p>{@code firstURL} is added to the include list's prefix collection (presumably so that
     * only URLs below the start URL are followed by default - confirm against {@code FilterList}).
     *
     * @param firstURL the URL the crawl starts at
     */
    public ScraperTask(String firstURL) {
        this.id = UUID.randomUUID().toString();
        this.firstURL = firstURL;
        this.strategy = Strategy.RANDOM;
        this.exclude = new FilterList();
        this.include = new FilterList();
        this.find = new FilterList();
        this.include.getPrefix().add(firstURL);
    }

    /** @return the random UUID string identifying this task */
    public String getID() {
        return id;
    }

    /** @return the URL the crawl starts at */
    public String getFirstURL() {
        return firstURL;
    }

    /** @param strategy how the next URL to visit is chosen from the pending ones */
    public void setStrategy(Strategy strategy) {
        this.strategy = strategy;
    }

    /** @return how the next URL to visit is chosen from the pending ones */
    public Strategy getStrategy() {
        return strategy;
    }

    /** @param cookie value sent as the {@code Cookie} header, or {@code null} to send none */
    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    /** @return the value sent as the {@code Cookie} header, or {@code null} if none is sent */
    public String getCookie() {
        return cookie;
    }

    /** @param userAgent value sent as the {@code User-Agent} header */
    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /** @return the value sent as the {@code User-Agent} header */
    public String getUserAgent() {
        return userAgent;
    }

    /** @return the filters a link must <em>not</em> match in order to be followed */
    public FilterList getExclude() {
        return exclude;
    }

    /** @return the filters a link must match in order to be followed */
    public FilterList getInclude() {
        return include;
    }

    /** @return the filters selecting which links are reported to the channel */
    public FilterList getFind() {
        return find;
    }

    /** @return the {@link System#currentTimeMillis()} value taken when the task was started, 0 before that */
    public long getStartTime() {
        return startTime;
    }

    /**
     * Submits the crawl to {@code ScraperBot.EXECUTOR}. Does nothing if the task was already started.
     *
     * @param channel the channel that receives found links, failure notices and the final "Done!"
     */
    public void start(MessageChannel channel) {
        if (future != null) {
            return;
        }
        this.startTime = System.currentTimeMillis();
        future = ScraperBot.EXECUTOR.submit(() -> crawl(channel));
    }

    /** Interrupts the running crawl, if the task has been started. */
    public void cancel() {
        if (future != null) {
            future.cancel(true);
        }
    }

    /** Runs the crawl loop until no pending URL is left, the thread is interrupted or an error occurs. */
    private void crawl(MessageChannel channel) {
        try {
            Set<String> visited = new HashSet<>();
            // Insertion-ordered, so that taking the first element really is FIFO for BREADTH_FIRST
            Set<String> toVisit = new LinkedHashSet<>();
            Set<String> found = new HashSet<>();
            toVisit.add(firstURL);
            Random random = new Random();

            while (!toVisit.isEmpty()) {
                String url = pickNext(toVisit, random);
                System.out.println(toVisit.size() + " " + url); // TODO: show in info command
                toVisit.remove(url);
                if (!visited.add(url)) {
                    System.out.println("Deja vu, I've been in this place before");
                    continue;
                }

                Set<String> links = getLinks(url);
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                links.removeAll(visited);

                // The filters are applied to each discovered link, not to the page it was found on
                links.stream()
                    .filter(l -> include.matches(l) && !exclude.matches(l))
                    .forEach(toVisit::add);

                Set<String> newFound = links.stream()
                    .filter(l -> find.matches(l))
                    .collect(Collectors.toSet());
                System.out.println("FOUND: " + newFound); // TODO: show in info command
                newFound.removeAll(found); // only report links that were not reported before
                found.addAll(newFound);
                if (!newFound.isEmpty()) {
                    Utils.splitToLength(newFound.stream().collect(Collectors.joining("\n")), Message.MAX_CONTENT_LENGTH, Utils.SPLIT_NEWLINES).stream()
                        .forEach(s -> channel.sendMessage(s).queue());
                }

                Thread.sleep(DELAY_MILLIS);
            }
        } catch (InterruptedException ignored) {
            // The task was cancelled while sleeping; nothing to report beyond the final message
        } catch (Exception e) {
            channel.sendMessage("Failed: " + e.toString()).queue();
        } finally {
            // Always unregister, even if something unexpected escapes the loop
            ScraperBot.TASKS.remove(this);
        }
        channel.sendMessage("Done!").queue();
    }

    /** Chooses the next URL from the non-empty pending set according to the current strategy. */
    private String pickNext(Set<String> toVisit, Random random) {
        switch (strategy) {
            case BREADTH_FIRST:
                return toVisit.iterator().next();
            case RANDOM:
            default:
                return new ArrayList<>(toVisit).get(random.nextInt(toVisit.size()));
        }
    }

    /**
     * Removes the fragment from a URI.
     *
     * @param uri the URI to clean
     * @return the URI rebuilt from its scheme and scheme-specific part only
     * @throws URISyntaxException if {@code uri} is not a valid URI
     */
    private static String clean(String uri) throws URISyntaxException {
        // new URI(...) instead of URI.create(...): the latter throws IllegalArgumentException,
        // which callers handling URISyntaxException per link would not catch
        URI u = new URI(uri);
        return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString();
    }

    /**
     * Fetches a page and returns the cleaned absolute targets of all its {@code <a>} elements.
     *
     * <p>Up to {@link #MAX_ATTEMPTS} attempts are made, with a pause after every failed one.
     * A 404 response is not retried. If the thread is interrupted while the request is in flight,
     * the interrupt flag is restored and an empty set is returned.
     *
     * @param url the page to fetch
     * @return a mutable set of links; empty if the page is missing or could not be fetched
     * @throws Exception if the thread is interrupted while waiting between two attempts
     */
    private Set<String> getLinks(String url) throws Exception {
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                HttpRequest.Builder request = HttpRequest.newBuilder(URI.create(url))
                    .header("User-Agent", userAgent);
                if (cookie != null) {
                    request.header("Cookie", cookie);
                }

                var response = CLIENT.send(request.build(), BodyHandlers.ofString());
                int status = response.statusCode();
                if (status == 404) {
                    // The page does not exist; asking again will not change that
                    return new HashSet<>();
                }
                if (status == 200) {
                    return extractLinks(response.body(), url);
                }
                System.out.println("RETRY");
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return new HashSet<>();
            } catch (Exception e) {
                // Best effort: treat any other failure as transient and try again
                System.err.println("Fetching " + url + " failed: " + e);
            }
            Thread.sleep(DELAY_MILLIS);
        }
        return new HashSet<>();
    }

    /** Parses the HTML and collects the cleaned absolute link targets, skipping empty or malformed ones. */
    private static Set<String> extractLinks(String html, String baseUrl) {
        Document document = Jsoup.parse(html, baseUrl);
        return document.getElementsByTag("a").stream()
            .map(a -> a.attr("abs:href"))
            // jsoup yields "" when the href is missing or cannot be made absolute
            .filter(href -> !href.isEmpty())
            .map(href -> {
                try {
                    return clean(href);
                } catch (URISyntaxException e) {
                    System.err.println("Skipping malformed link: " + e.getMessage());
                    return null;
                }
            })
            .filter(Objects::nonNull)
            .collect(Collectors.toCollection(HashSet::new));
    }

    /** The order in which pending URLs are visited. */
    public enum Strategy {

        // DEPTH_FIRST("Depth-First Search"), TODO: implement
        BREADTH_FIRST("Breadth-First Search"),
        RANDOM("Random"),
        ;

        /** Human-readable name of the strategy. */
        public final String friendlyName;

        Strategy(String friendlyName) {
            this.friendlyName = friendlyName;
        }

        /** @return the human-readable name of the strategy */
        public String getFriendlyName() {
            return friendlyName;
        }
    }
}