package me.mrletsplay.scraperbot; import java.net.URI; import java.net.URISyntaxException; import java.net.http.HttpClient; import java.net.http.HttpClient.Redirect; import java.net.http.HttpClient.Version; import java.net.http.HttpRequest; import java.net.http.HttpResponse.BodyHandlers; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.Objects; import java.util.Random; import java.util.Set; import java.util.UUID; import java.util.concurrent.Future; import java.util.stream.Collectors; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import net.dv8tion.jda.api.entities.Message; import net.dv8tion.jda.api.entities.channel.middleman.MessageChannel; public class ScraperTask { private static final HttpClient CLIENT = HttpClient.newBuilder() .version(Version.HTTP_2) .followRedirects(Redirect.ALWAYS) .build(); private String id; private String firstURL; private Strategy strategy; private String cookie, // For CF bypass, login etc. userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0"; private FilterList exclude; private FilterList include; private FilterList find; private Future future; private long startTime; public ScraperTask(String firstURL) { this.id = UUID.randomUUID().toString(); this.firstURL = firstURL; this.strategy = Strategy.RANDOM; this.exclude = new FilterList(); this.include = new FilterList(); this.find = new FilterList(); this.include.getPrefix().add(firstURL); } public String getID() { return id; } public String getFirstURL() { return firstURL; } public void setStrategy(Strategy strategy) { this.strategy = strategy; } public Strategy getStrategy() { return strategy; } public void setCookie(String cookie) { this.cookie = cookie; } public String getCookie() { return cookie; } public void setUserAgent(String userAgent) { this.userAgent = userAgent; } public String getUserAgent() { return userAgent; } public FilterList getExclude() { return exclude; } public FilterList getInclude() { return include; } public FilterList getFind() { return find; } public long getStartTime() { return startTime; } public void start(MessageChannel channel) { if(future != null) return; this.startTime = System.currentTimeMillis(); future = ScraperBot.EXECUTOR.submit(() -> { try { Set visited = new HashSet<>(); Set toVisit = new HashSet<>(); Set found = new HashSet<>(); toVisit.add(firstURL); Random r = new Random(); while(!toVisit.isEmpty()) { String url; switch(strategy) { case BREADTH_FIRST: url = toVisit.iterator().next(); break; case RANDOM: default: url = new ArrayList<>(toVisit).get(r.nextInt(toVisit.size())); break; } System.out.println(toVisit.size() + " " + url); // TODO: show in info command toVisit.remove(url); if(!visited.add(url)) { System.out.println("Deja vu, I've been in this place before"); continue; } Set links = getLinks(url); if(Thread.currentThread().isInterrupted()) break; links.removeAll(visited); links.stream().filter(l -> include.matches(url) && !exclude.matches(url)).forEach(toVisit::add); Set newFound = links.stream().filter(l -> find.matches(l)).collect(Collectors.toSet()); System.out.println("FOUND: " + newFound); // TODO: show in info command newFound.removeAll(found); found.addAll(newFound); if(!newFound.isEmpty()) { Utils.splitToLength(newFound.stream().collect(Collectors.joining("\n")), Message.MAX_CONTENT_LENGTH, Utils.SPLIT_NEWLINES).stream() .forEach(s -> channel.sendMessage(s).queue()); } Thread.sleep(1000); } }catch(InterruptedException e) { }catch(Exception e) { channel.sendMessage("Failed: " + e.toString()).queue(); } ScraperBot.TASKS.remove(this); channel.sendMessage("Done!").queue(); }); } public void cancel() { if(future != null) future.cancel(true); } private static String clean(String uri) throws URISyntaxException { URI u = URI.create(uri); return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString(); } private Set getLinks(String url) throws Exception { for(int i = 0; i < 5; i++) { try { HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url)) .header("User-Agent", userAgent); if(cookie != null) b.header("Cookie", cookie); var r = CLIENT.send(b.build(), BodyHandlers.ofString()); if(r.statusCode() != 200) { if(r.statusCode() == 404) continue; System.out.println("RETRY"); throw new RuntimeException("Failed"); } Document d = Jsoup.parse(r.body(), url); Set linksOnPage = d.getElementsByTag("a").stream().map(a -> { try { return clean(a.attr("abs:href")); } catch (URISyntaxException e) { e.printStackTrace(); return null; } }) .filter(Objects::nonNull) .collect(Collectors.toSet()); return linksOnPage; }catch(InterruptedException e) { Thread.currentThread().interrupt(); return Collections.emptySet(); }catch(Exception e) { Thread.sleep(1000); // e.printStackTrace(); } } return Collections.emptySet(); } public static enum Strategy { // DEPTH_FIRST("Depth-First Search"), TODO: implement BREADTH_FIRST("Breadth-First Search"), RANDOM("Random"), ; public final String friendlyName; private Strategy(String friendlyName) { this.friendlyName = friendlyName; } public String getFriendlyName() { return friendlyName; } } }