235 lines
5.6 KiB
Java
235 lines
5.6 KiB
Java
package me.mrletsplay.scraperbot;
|
|
|
|
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpClient.Redirect;
import java.net.http.HttpClient.Version;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Future;
import java.util.stream.Collectors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import net.dv8tion.jda.api.entities.Message;
import net.dv8tion.jda.api.entities.channel.middleman.MessageChannel;
|
|
|
|
public class ScraperTask {
|
|
|
|
private static final HttpClient CLIENT = HttpClient.newBuilder()
|
|
.version(Version.HTTP_2)
|
|
.followRedirects(Redirect.ALWAYS)
|
|
.build();
|
|
|
|
private String id;
|
|
private String firstURL;
|
|
private Strategy strategy;
|
|
|
|
private String
|
|
cookie, // For CF bypass, login etc.
|
|
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0";
|
|
|
|
private FilterList exclude;
|
|
private FilterList include;
|
|
private FilterList find;
|
|
|
|
private Future<?> future;
|
|
private long startTime;
|
|
|
|
public ScraperTask(String firstURL) {
|
|
this.id = UUID.randomUUID().toString();
|
|
this.firstURL = firstURL;
|
|
this.strategy = Strategy.RANDOM;
|
|
this.exclude = new FilterList();
|
|
this.include = new FilterList();
|
|
this.find = new FilterList();
|
|
|
|
this.include.getPrefix().add(firstURL);
|
|
}
|
|
|
|
public String getID() {
|
|
return id;
|
|
}
|
|
|
|
public String getFirstURL() {
|
|
return firstURL;
|
|
}
|
|
|
|
public void setStrategy(Strategy strategy) {
|
|
this.strategy = strategy;
|
|
}
|
|
|
|
public Strategy getStrategy() {
|
|
return strategy;
|
|
}
|
|
|
|
public void setCookie(String cookie) {
|
|
this.cookie = cookie;
|
|
}
|
|
|
|
public String getCookie() {
|
|
return cookie;
|
|
}
|
|
|
|
public void setUserAgent(String userAgent) {
|
|
this.userAgent = userAgent;
|
|
}
|
|
|
|
public String getUserAgent() {
|
|
return userAgent;
|
|
}
|
|
|
|
public FilterList getExclude() {
|
|
return exclude;
|
|
}
|
|
|
|
public FilterList getInclude() {
|
|
return include;
|
|
}
|
|
|
|
public FilterList getFind() {
|
|
return find;
|
|
}
|
|
|
|
public long getStartTime() {
|
|
return startTime;
|
|
}
|
|
|
|
public void start(MessageChannel channel) {
|
|
if(future != null) return;
|
|
|
|
this.startTime = System.currentTimeMillis();
|
|
|
|
future = ScraperBot.EXECUTOR.submit(() -> {
|
|
try {
|
|
Set<String> visited = new HashSet<>();
|
|
Set<String> toVisit = new HashSet<>();
|
|
Set<String> found = new HashSet<>();
|
|
|
|
toVisit.add(firstURL);
|
|
|
|
Random r = new Random();
|
|
|
|
while(!toVisit.isEmpty()) {
|
|
String url;
|
|
switch(strategy) {
|
|
case BREADTH_FIRST:
|
|
url = toVisit.iterator().next();
|
|
break;
|
|
case RANDOM:
|
|
default:
|
|
url = new ArrayList<>(toVisit).get(r.nextInt(toVisit.size()));
|
|
break;
|
|
}
|
|
|
|
System.out.println(toVisit.size() + " " + url); // TODO: show in info command
|
|
toVisit.remove(url);
|
|
if(!visited.add(url)) {
|
|
System.out.println("Deja vu, I've been in this place before");
|
|
continue;
|
|
}
|
|
|
|
Set<String> links = getLinks(url);
|
|
if(Thread.currentThread().isInterrupted()) break;
|
|
links.removeAll(visited);
|
|
links.stream().filter(l -> include.matches(url) && !exclude.matches(url)).forEach(toVisit::add);
|
|
|
|
Set<String> newFound = links.stream().filter(l -> find.matches(l)).collect(Collectors.toSet());
|
|
System.out.println("FOUND: " + newFound); // TODO: show in info command
|
|
newFound.removeAll(found);
|
|
found.addAll(newFound);
|
|
if(!newFound.isEmpty()) {
|
|
Utils.splitToLength(newFound.stream().collect(Collectors.joining("\n")), Message.MAX_CONTENT_LENGTH, Utils.SPLIT_NEWLINES).stream()
|
|
.forEach(s -> channel.sendMessage(s).queue());
|
|
}
|
|
|
|
Thread.sleep(1000);
|
|
}
|
|
}catch(InterruptedException e) {
|
|
}catch(Exception e) {
|
|
channel.sendMessage("Failed: " + e.toString()).queue();
|
|
}
|
|
|
|
ScraperBot.TASKS.remove(this);
|
|
channel.sendMessage("Done!").queue();
|
|
});
|
|
}
|
|
|
|
public void cancel() {
|
|
if(future != null) future.cancel(true);
|
|
}
|
|
|
|
private static String clean(String uri) throws URISyntaxException {
|
|
URI u = URI.create(uri);
|
|
return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString();
|
|
}
|
|
|
|
private Set<String> getLinks(String url) throws Exception {
|
|
for(int i = 0; i < 5; i++) {
|
|
try {
|
|
HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url))
|
|
.header("User-Agent", userAgent);
|
|
|
|
if(cookie != null) b.header("Cookie", cookie);
|
|
|
|
var r = CLIENT.send(b.build(), BodyHandlers.ofString());
|
|
|
|
if(r.statusCode() != 200) {
|
|
if(r.statusCode() == 404) continue;
|
|
|
|
System.out.println("RETRY");
|
|
throw new RuntimeException("Failed");
|
|
}
|
|
|
|
Document d = Jsoup.parse(r.body(), url);
|
|
Set<String> linksOnPage = d.getElementsByTag("a").stream().map(a -> {
|
|
try {
|
|
return clean(a.attr("abs:href"));
|
|
} catch (URISyntaxException e) {
|
|
e.printStackTrace();
|
|
return null;
|
|
}
|
|
})
|
|
.filter(Objects::nonNull)
|
|
.collect(Collectors.toSet());
|
|
return linksOnPage;
|
|
}catch(InterruptedException e) {
|
|
Thread.currentThread().interrupt();
|
|
return Collections.emptySet();
|
|
}catch(Exception e) {
|
|
Thread.sleep(1000);
|
|
// e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
return Collections.emptySet();
|
|
}
|
|
|
|
public static enum Strategy {
|
|
// DEPTH_FIRST("Depth-First Search"), TODO: implement
|
|
BREADTH_FIRST("Breadth-First Search"),
|
|
RANDOM("Random"),
|
|
;
|
|
|
|
public final String friendlyName;
|
|
|
|
private Strategy(String friendlyName) {
|
|
this.friendlyName = friendlyName;
|
|
}
|
|
|
|
public String getFriendlyName() {
|
|
return friendlyName;
|
|
}
|
|
}
|
|
|
|
}
|