initial commit
This commit is contained in:
commit
3f4ef49bc3
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
bin
|
||||||
|
target
|
||||||
|
.classpath
|
||||||
|
.settings
|
||||||
|
config.json
|
23
.project
Normal file
23
.project
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<projectDescription>
|
||||||
|
<name>ZapScraper</name>
|
||||||
|
<comment></comment>
|
||||||
|
<projects>
|
||||||
|
</projects>
|
||||||
|
<buildSpec>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.jdt.core.javabuilder</name>
|
||||||
|
<arguments>
|
||||||
|
</arguments>
|
||||||
|
</buildCommand>
|
||||||
|
<buildCommand>
|
||||||
|
<name>org.eclipse.m2e.core.maven2Builder</name>
|
||||||
|
<arguments>
|
||||||
|
</arguments>
|
||||||
|
</buildCommand>
|
||||||
|
</buildSpec>
|
||||||
|
<natures>
|
||||||
|
<nature>org.eclipse.m2e.core.maven2Nature</nature>
|
||||||
|
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||||
|
</natures>
|
||||||
|
</projectDescription>
|
50
pom.xml
Normal file
50
pom.xml
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||||
|
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<groupId>ZapScraper</groupId>
|
||||||
|
<artifactId>ZapScraper</artifactId>
|
||||||
|
<version>0.0.1-SNAPSHOT</version>
|
||||||
|
<build>
|
||||||
|
<sourceDirectory>src</sourceDirectory>
|
||||||
|
<plugins>
|
||||||
|
<plugin>
|
||||||
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
<version>3.8.1</version>
|
||||||
|
<configuration>
|
||||||
|
<release>20</release>
|
||||||
|
</configuration>
|
||||||
|
</plugin>
|
||||||
|
</plugins>
|
||||||
|
</build>
|
||||||
|
<repositories>
|
||||||
|
<repository>
|
||||||
|
<id>Graphite-Official</id>
|
||||||
|
<url>https://maven.graphite-official.com/releases</url>
|
||||||
|
</repository>
|
||||||
|
</repositories>
|
||||||
|
<dependencies>
|
||||||
|
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.jsoup</groupId>
|
||||||
|
<artifactId>jsoup</artifactId>
|
||||||
|
<version>1.16.2</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>me.mrletsplay</groupId>
|
||||||
|
<artifactId>MrCore</artifactId>
|
||||||
|
<version>4.4</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.dv8tion</groupId>
|
||||||
|
<artifactId>JDA</artifactId>
|
||||||
|
<version>5.0.0-beta.17</version>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>club.minnced</groupId>
|
||||||
|
<artifactId>opus-java</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
</project>
|
37
src/me/mrletsplay/scraperbot/FilterList.java
Normal file
37
src/me/mrletsplay/scraperbot/FilterList.java
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
package me.mrletsplay.scraperbot;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
 * A set of string filters made up of three independent lists: exact values,
 * prefixes and regular expressions.
 * <p>
 * The lists returned by the getters are the live backing lists; callers add to
 * and remove from them directly.
 */
public class FilterList {

	private final List<String> exact = new ArrayList<>();
	private final List<String> prefix = new ArrayList<>();
	private final List<String> regex = new ArrayList<>();

	/** Creates a filter list with no entries, which matches nothing. */
	public FilterList() {
	}

	/** @return the live list of values that must equal the tested string */
	public List<String> getExact() {
		return exact;
	}

	/** @return the live list of prefixes the tested string may start with */
	public List<String> getPrefix() {
		return prefix;
	}

	/** @return the live list of regular expressions the whole tested string may match */
	public List<String> getRegex() {
		return regex;
	}

	/**
	 * Tests a string against all three lists.
	 *
	 * @param str the string to test
	 * @return {@code true} if {@code str} equals an exact entry, starts with a
	 *         prefix entry, or is matched in full by a regex entry
	 */
	public boolean matches(String str) {
		if(exact.contains(str)) return true;

		for(String candidate : prefix) {
			if(str.startsWith(candidate)) return true;
		}

		for(String expression : regex) {
			// Pattern.matches requires the expression to match the entire string
			if(Pattern.matches(expression, str)) return true;
		}

		return false;
	}

}
|
131
src/me/mrletsplay/scraperbot/ScraperBot.java
Normal file
131
src/me/mrletsplay/scraperbot/ScraperBot.java
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
package me.mrletsplay.scraperbot;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.ScheduledExecutorService;
|
||||||
|
|
||||||
|
import me.mrletsplay.mrcore.io.IOUtils;
|
||||||
|
import me.mrletsplay.mrcore.json.JSONObject;
|
||||||
|
import me.mrletsplay.mrcore.json.converter.JSONConverter;
|
||||||
|
import me.mrletsplay.mrcore.json.converter.SerializationOption;
|
||||||
|
import me.mrletsplay.scraperbot.command.CommandScrape;
|
||||||
|
import me.mrletsplay.scraperbot.command.SimpleCommand;
|
||||||
|
import net.dv8tion.jda.api.JDA;
|
||||||
|
import net.dv8tion.jda.api.JDABuilder;
|
||||||
|
import net.dv8tion.jda.api.OnlineStatus;
|
||||||
|
import net.dv8tion.jda.api.entities.Activity;
|
||||||
|
import net.dv8tion.jda.api.events.interaction.command.CommandAutoCompleteInteractionEvent;
|
||||||
|
import net.dv8tion.jda.api.events.interaction.command.SlashCommandInteractionEvent;
|
||||||
|
import net.dv8tion.jda.api.hooks.ListenerAdapter;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.Commands;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.SlashCommandData;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.SubcommandData;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.SubcommandGroupData;
|
||||||
|
import net.dv8tion.jda.api.requests.restaction.CommandListUpdateAction;
|
||||||
|
|
||||||
|
public class ScraperBot extends ListenerAdapter {
|
||||||
|
|
||||||
|
public static final Path CONFIG_PATH = Paths.get("config.json");
|
||||||
|
|
||||||
|
public static final ScheduledExecutorService EXECUTOR = Executors.newScheduledThreadPool(0);
|
||||||
|
public static final List<ScraperTask> TASKS = new ArrayList<>();
|
||||||
|
|
||||||
|
public static final List<SimpleCommand> COMMANDS = Arrays.asList(
|
||||||
|
new CommandScrape()
|
||||||
|
);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
if(!Files.exists(CONFIG_PATH)) {
|
||||||
|
IOUtils.createFile(CONFIG_PATH.toFile());
|
||||||
|
Files.writeString(CONFIG_PATH, ScraperBotConfig.createDefault().toJSON(SerializationOption.DONT_INCLUDE_CLASS).toFancyString(), StandardCharsets.UTF_8);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String cfgString = Files.readString(Paths.get("config.json"));
|
||||||
|
ScraperBotConfig config = JSONConverter.decodeObject(new JSONObject(cfgString), ScraperBotConfig.class);
|
||||||
|
|
||||||
|
JDA jda = JDABuilder.createDefault(config.getToken())
|
||||||
|
.setStatus(OnlineStatus.IDLE)
|
||||||
|
.setActivity(Activity.watching("you and all of your websites \uD83D\uDD0D"))
|
||||||
|
.addEventListeners(new ScraperBot())
|
||||||
|
.setAutoReconnect(true)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
jda.awaitReady();
|
||||||
|
|
||||||
|
CommandListUpdateAction a;
|
||||||
|
if(config.isUseTestingServer()) {
|
||||||
|
System.out.println("Using testing server: " + config.getTestingServer());
|
||||||
|
a = jda.getGuildById(config.getTestingServer()).updateCommands();
|
||||||
|
}else {
|
||||||
|
a = jda.updateCommands();
|
||||||
|
}
|
||||||
|
|
||||||
|
for(SimpleCommand command : COMMANDS) {
|
||||||
|
SlashCommandData d = Commands.slash(command.getName(), command.getDescription());
|
||||||
|
|
||||||
|
if(!command.getChildren().isEmpty()) {
|
||||||
|
for(SimpleCommand subCommand : command.getChildren()) {
|
||||||
|
if(!subCommand.getChildren().isEmpty()) {
|
||||||
|
SubcommandGroupData gd = new SubcommandGroupData(subCommand.getName(), subCommand.getDescription());
|
||||||
|
|
||||||
|
for(SimpleCommand groupCommand : subCommand.getChildren()) {
|
||||||
|
gd.addSubcommands(new SubcommandData(groupCommand.getName(), groupCommand.getDescription()).addOptions(groupCommand.getOptions()));
|
||||||
|
}
|
||||||
|
|
||||||
|
d.addSubcommandGroups(gd);
|
||||||
|
}else {
|
||||||
|
d.addSubcommands(new SubcommandData(subCommand.getName(), subCommand.getDescription()).addOptions(subCommand.getOptions()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}else {
|
||||||
|
d.addOptions(command.getOptions());
|
||||||
|
}
|
||||||
|
|
||||||
|
a.addCommands(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
a.queue();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onSlashCommandInteraction(SlashCommandInteractionEvent event) {
|
||||||
|
List<String> parts = new ArrayList<>(Arrays.asList(event.getFullCommandName().split(" ")));
|
||||||
|
|
||||||
|
String c = parts.remove(0);
|
||||||
|
SimpleCommand cmd = COMMANDS.stream().filter(cm -> cm.getName().equals(c)).findFirst().orElse(null);
|
||||||
|
if(cmd == null) return;
|
||||||
|
|
||||||
|
while(!parts.isEmpty()) {
|
||||||
|
String sc = parts.remove(0);
|
||||||
|
cmd = cmd.getChildren().stream().filter(ch -> ch.getName().equals(sc)).findFirst().orElse(null);
|
||||||
|
if(cmd == null) return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd.action(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void onCommandAutoCompleteInteraction(CommandAutoCompleteInteractionEvent event) {
|
||||||
|
List<String> parts = new ArrayList<>(Arrays.asList(event.getFullCommandName().split(" ")));
|
||||||
|
|
||||||
|
String c = parts.remove(0);
|
||||||
|
SimpleCommand cmd = COMMANDS.stream().filter(cm -> cm.getName().equals(c)).findFirst().orElse(null);
|
||||||
|
if(cmd == null) return;
|
||||||
|
|
||||||
|
while(!parts.isEmpty()) {
|
||||||
|
String sc = parts.remove(0);
|
||||||
|
cmd = cmd.getChildren().stream().filter(ch -> ch.getName().equals(sc)).findFirst().orElse(null);
|
||||||
|
if(cmd == null) return;
|
||||||
|
}
|
||||||
|
|
||||||
|
cmd.complete(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
41
src/me/mrletsplay/scraperbot/ScraperBotConfig.java
Normal file
41
src/me/mrletsplay/scraperbot/ScraperBotConfig.java
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
package me.mrletsplay.scraperbot;
|
||||||
|
|
||||||
|
import me.mrletsplay.mrcore.json.converter.JSONConstructor;
|
||||||
|
import me.mrletsplay.mrcore.json.converter.JSONConvertible;
|
||||||
|
import me.mrletsplay.mrcore.json.converter.JSONValue;
|
||||||
|
|
||||||
|
public class ScraperBotConfig implements JSONConvertible {
|
||||||
|
|
||||||
|
@JSONValue
|
||||||
|
private String token;
|
||||||
|
|
||||||
|
@JSONValue
|
||||||
|
private boolean useTestingServer;
|
||||||
|
|
||||||
|
@JSONValue
|
||||||
|
private String testingServer;
|
||||||
|
|
||||||
|
@JSONConstructor
|
||||||
|
private ScraperBotConfig() {}
|
||||||
|
|
||||||
|
public String getToken() {
|
||||||
|
return token;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isUseTestingServer() {
|
||||||
|
return useTestingServer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getTestingServer() {
|
||||||
|
return testingServer;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ScraperBotConfig createDefault() {
|
||||||
|
ScraperBotConfig config = new ScraperBotConfig();
|
||||||
|
config.token = "Token";
|
||||||
|
config.useTestingServer = false;
|
||||||
|
config.testingServer = "Testing server ID";
|
||||||
|
return config;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
234
src/me/mrletsplay/scraperbot/ScraperTask.java
Normal file
234
src/me/mrletsplay/scraperbot/ScraperTask.java
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
package me.mrletsplay.scraperbot;
|
||||||
|
|
||||||
|
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpClient.Redirect;
import java.net.http.HttpClient.Version;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse.BodyHandlers;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.jsoup.Jsoup;
|
||||||
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
|
import net.dv8tion.jda.api.entities.Message;
|
||||||
|
import net.dv8tion.jda.api.entities.channel.middleman.MessageChannel;
|
||||||
|
|
||||||
|
public class ScraperTask {
|
||||||
|
|
||||||
|
private static final HttpClient CLIENT = HttpClient.newBuilder()
|
||||||
|
.version(Version.HTTP_2)
|
||||||
|
.followRedirects(Redirect.ALWAYS)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
private String id;
|
||||||
|
private String firstURL;
|
||||||
|
private Strategy strategy;
|
||||||
|
|
||||||
|
private String
|
||||||
|
cookie, // For CF bypass, login etc.
|
||||||
|
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0";
|
||||||
|
|
||||||
|
private FilterList exclude;
|
||||||
|
private FilterList include;
|
||||||
|
private FilterList find;
|
||||||
|
|
||||||
|
private Future<?> future;
|
||||||
|
private long startTime;
|
||||||
|
|
||||||
|
public ScraperTask(String firstURL) {
|
||||||
|
this.id = UUID.randomUUID().toString();
|
||||||
|
this.firstURL = firstURL;
|
||||||
|
this.strategy = Strategy.RANDOM;
|
||||||
|
this.exclude = new FilterList();
|
||||||
|
this.include = new FilterList();
|
||||||
|
this.find = new FilterList();
|
||||||
|
|
||||||
|
this.include.getPrefix().add(firstURL);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getID() {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFirstURL() {
|
||||||
|
return firstURL;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setStrategy(Strategy strategy) {
|
||||||
|
this.strategy = strategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Strategy getStrategy() {
|
||||||
|
return strategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setCookie(String cookie) {
|
||||||
|
this.cookie = cookie;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getCookie() {
|
||||||
|
return cookie;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUserAgent(String userAgent) {
|
||||||
|
this.userAgent = userAgent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUserAgent() {
|
||||||
|
return userAgent;
|
||||||
|
}
|
||||||
|
|
||||||
|
public FilterList getExclude() {
|
||||||
|
return exclude;
|
||||||
|
}
|
||||||
|
|
||||||
|
public FilterList getInclude() {
|
||||||
|
return include;
|
||||||
|
}
|
||||||
|
|
||||||
|
public FilterList getFind() {
|
||||||
|
return find;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getStartTime() {
|
||||||
|
return startTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void start(MessageChannel channel) {
|
||||||
|
if(future != null) return;
|
||||||
|
|
||||||
|
this.startTime = System.currentTimeMillis();
|
||||||
|
|
||||||
|
future = ScraperBot.EXECUTOR.submit(() -> {
|
||||||
|
try {
|
||||||
|
Set<String> visited = new HashSet<>();
|
||||||
|
Set<String> toVisit = new HashSet<>();
|
||||||
|
Set<String> found = new HashSet<>();
|
||||||
|
|
||||||
|
toVisit.add(firstURL);
|
||||||
|
|
||||||
|
Random r = new Random();
|
||||||
|
|
||||||
|
while(!toVisit.isEmpty()) {
|
||||||
|
String url;
|
||||||
|
switch(strategy) {
|
||||||
|
case BREADTH_FIRST:
|
||||||
|
url = toVisit.iterator().next();
|
||||||
|
break;
|
||||||
|
case RANDOM:
|
||||||
|
default:
|
||||||
|
url = new ArrayList<>(toVisit).get(r.nextInt(toVisit.size()));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println(toVisit.size() + " " + url); // TODO: show in info command
|
||||||
|
toVisit.remove(url);
|
||||||
|
if(!visited.add(url)) {
|
||||||
|
System.out.println("Deja vu, I've been in this place before");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Set<String> links = getLinks(url);
|
||||||
|
if(Thread.currentThread().isInterrupted()) break;
|
||||||
|
links.removeAll(visited);
|
||||||
|
links.stream().filter(l -> include.matches(url) && !exclude.matches(url)).forEach(toVisit::add);
|
||||||
|
|
||||||
|
Set<String> newFound = links.stream().filter(l -> find.matches(l)).collect(Collectors.toSet());
|
||||||
|
System.out.println("FOUND: " + newFound); // TODO: show in info command
|
||||||
|
newFound.removeAll(found);
|
||||||
|
found.addAll(newFound);
|
||||||
|
if(!newFound.isEmpty()) {
|
||||||
|
Utils.splitToLength(newFound.stream().collect(Collectors.joining("\n")), Message.MAX_CONTENT_LENGTH, Utils.SPLIT_NEWLINES).stream()
|
||||||
|
.forEach(s -> channel.sendMessage(s).queue());
|
||||||
|
}
|
||||||
|
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
}catch(InterruptedException e) {
|
||||||
|
}catch(Exception e) {
|
||||||
|
channel.sendMessage("Failed: " + e.toString()).queue();
|
||||||
|
}
|
||||||
|
|
||||||
|
ScraperBot.TASKS.remove(this);
|
||||||
|
channel.sendMessage("Done!").queue();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void cancel() {
|
||||||
|
if(future != null) future.cancel(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String clean(String uri) throws URISyntaxException {
|
||||||
|
URI u = URI.create(uri);
|
||||||
|
return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Set<String> getLinks(String url) throws Exception {
|
||||||
|
for(int i = 0; i < 5; i++) {
|
||||||
|
try {
|
||||||
|
HttpRequest.Builder b = HttpRequest.newBuilder(URI.create(url))
|
||||||
|
.header("User-Agent", userAgent);
|
||||||
|
|
||||||
|
if(cookie != null) b.header("Cookie", cookie);
|
||||||
|
|
||||||
|
var r = CLIENT.send(b.build(), BodyHandlers.ofString());
|
||||||
|
|
||||||
|
if(r.statusCode() != 200) {
|
||||||
|
if(r.statusCode() == 404) continue;
|
||||||
|
|
||||||
|
System.out.println("RETRY");
|
||||||
|
throw new RuntimeException("Failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
Document d = Jsoup.parse(r.body(), url);
|
||||||
|
Set<String> linksOnPage = d.getElementsByTag("a").stream().map(a -> {
|
||||||
|
try {
|
||||||
|
return clean(a.attr("abs:href"));
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.collect(Collectors.toSet());
|
||||||
|
return linksOnPage;
|
||||||
|
}catch(InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
return Collections.emptySet();
|
||||||
|
}catch(Exception e) {
|
||||||
|
Thread.sleep(1000);
|
||||||
|
// e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Collections.emptySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static enum Strategy {
|
||||||
|
// DEPTH_FIRST("Depth-First Search"), TODO: implement
|
||||||
|
BREADTH_FIRST("Breadth-First Search"),
|
||||||
|
RANDOM("Random"),
|
||||||
|
;
|
||||||
|
|
||||||
|
public final String friendlyName;
|
||||||
|
|
||||||
|
private Strategy(String friendlyName) {
|
||||||
|
this.friendlyName = friendlyName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getFriendlyName() {
|
||||||
|
return friendlyName;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
53
src/me/mrletsplay/scraperbot/Utils.java
Normal file
53
src/me/mrletsplay/scraperbot/Utils.java
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
package me.mrletsplay.scraperbot;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/** String helpers. */
public class Utils {

	/**
	 * Split policies for {@link #splitToLength(String, int, int)}.
	 * {@code SPLIT_ANYHWERE} is a misspelling kept for existing callers;
	 * {@code SPLIT_ANYWHERE} is the same value.
	 */
	public static final int
		SPLIT_SPACES = 0,
		SPLIT_NEWLINES = 1,
		SPLIT_ANYHWERE = 2,
		SPLIT_ANYWHERE = SPLIT_ANYHWERE;

	/**
	 * Splits a string into consecutive pieces of at most {@code length} characters.
	 * Concatenating the pieces yields the original string; separators are kept and
	 * end up at the start of the following piece.
	 * <p>
	 * With {@link #SPLIT_SPACES} or {@link #SPLIT_NEWLINES} each piece ends right
	 * before the last separator that still fits. A stretch without a fitting
	 * separator is cut at exactly {@code length} characters.
	 *
	 * @param longString the string to split
	 * @param length the maximum length of a piece, must be positive unless {@code longString} is empty
	 * @param splitPolicy one of the {@code SPLIT_*} constants
	 * @return a new mutable list of pieces; empty if {@code longString} is empty
	 * @throws IllegalArgumentException if the policy is unknown, or if {@code length}
	 *         is not positive and there is something to split
	 */
	public static List<String> splitToLength(String longString, int length, int splitPolicy) {
		if(splitPolicy != SPLIT_SPACES && splitPolicy != SPLIT_NEWLINES && splitPolicy != SPLIT_ANYHWERE) {
			throw new IllegalArgumentException("Invalid split policy");
		}

		List<String> strings = new ArrayList<>();
		if(longString.isEmpty()) return strings;

		// A non-positive length can never make progress and would loop forever
		if(length <= 0) throw new IllegalArgumentException("Length must be positive, got " + length);

		char separator = splitPolicy == SPLIT_SPACES ? ' ' : '\n';
		int total = longString.length();
		int start = 0;

		// Walk the string by offset instead of repeatedly copying the remainder
		while(start < total) {
			int pieceLength = splitPolicy == SPLIT_ANYHWERE
				? Math.min(length, total - start)
				: findSplit(longString, start, length, separator);

			strings.add(longString.substring(start, start + pieceLength));
			start += pieceLength;
		}

		return strings;
	}

	/**
	 * Finds how many characters starting at {@code start} go into the next piece:
	 * up to the last separator (or the end of the text) that is at most
	 * {@code length} characters away, or {@code length} if there is none.
	 */
	private static int findSplit(String text, int start, int length, char separator) {
		int remaining = text.length() - start;
		int splitAt = 0;

		while(splitAt < remaining) {
			// Look for the next possible position to split, skipping the current one
			int next = text.indexOf(separator, start + splitAt + 1);
			next = next == -1 ? remaining : next - start;
			if(next > length) break;
			splitAt = next;
		}

		// There is no way to split at the requested character. Just cut it to the max length
		return splitAt == 0 ? length : splitAt;
	}

}
|
234
src/me/mrletsplay/scraperbot/command/CommandScrape.java
Normal file
234
src/me/mrletsplay/scraperbot/command/CommandScrape.java
Normal file
@ -0,0 +1,234 @@
|
|||||||
|
package me.mrletsplay.scraperbot.command;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import me.mrletsplay.scraperbot.FilterList;
|
||||||
|
import me.mrletsplay.scraperbot.ScraperBot;
|
||||||
|
import me.mrletsplay.scraperbot.ScraperTask;
|
||||||
|
import me.mrletsplay.scraperbot.ScraperTask.Strategy;
|
||||||
|
import net.dv8tion.jda.api.EmbedBuilder;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.OptionMapping;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.OptionType;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.OptionData;
|
||||||
|
import net.dv8tion.jda.api.utils.messages.MessageCreateBuilder;
|
||||||
|
|
||||||
|
public class CommandScrape extends SimpleParentCommand {
|
||||||
|
|
||||||
|
public CommandScrape() {
|
||||||
|
super("scrape", "Scrape a website");
|
||||||
|
|
||||||
|
OptionData strategy = new OptionData(OptionType.STRING, "strategy", "Scraping strategy", false);
|
||||||
|
for(Strategy s : Strategy.values()) {
|
||||||
|
strategy.addChoice(s.getFriendlyName(), s.name());
|
||||||
|
}
|
||||||
|
|
||||||
|
addChild("create", "Create a scraping task", event -> {
|
||||||
|
ScraperTask task = new ScraperTask(event.getOption("first_url").getAsString());
|
||||||
|
|
||||||
|
OptionMapping map = event.getOption("strategy");
|
||||||
|
if(map != null) {
|
||||||
|
task.setStrategy(Strategy.valueOf(map.getAsString()));
|
||||||
|
}
|
||||||
|
|
||||||
|
ScraperBot.TASKS.add(task);
|
||||||
|
|
||||||
|
event.reply(String.format("Task created: `%s`", task.getID())).queue();
|
||||||
|
},
|
||||||
|
new OptionData(OptionType.STRING, "first_url", "Initial URL to scrape", true),
|
||||||
|
strategy);
|
||||||
|
|
||||||
|
addChild("info", "Show info about a scraping task", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
EmbedBuilder b = new EmbedBuilder();
|
||||||
|
b.setTitle(task.getID());
|
||||||
|
|
||||||
|
b.addField("URL", task.getFirstURL(), true);
|
||||||
|
b.addField("Strategy", task.getStrategy().getFriendlyName(), true);
|
||||||
|
b.addField("Started", task.getStartTime() == 0 ? "No" : (System.currentTimeMillis() - task.getStartTime()) / (1000 * 60) + " minute(s) ago", true);
|
||||||
|
|
||||||
|
b.addField("Include: Exact", task.getInclude().getExact().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Include: Prefix", task.getInclude().getPrefix().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Include: Regex", task.getInclude().getRegex().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
|
||||||
|
b.addField("Exclude: Exact", task.getExclude().getExact().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Exclude: Prefix", task.getExclude().getPrefix().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Exclude: Regex", task.getExclude().getRegex().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
|
||||||
|
b.addField("Find: Exact", task.getFind().getExact().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Find: Prefix", task.getFind().getPrefix().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
b.addField("Find: Regex", task.getFind().getRegex().stream().collect(Collectors.joining("\n")), true);
|
||||||
|
|
||||||
|
event.reply(new MessageCreateBuilder().addEmbeds(b.build()).build()).queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
}, new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true));
|
||||||
|
|
||||||
|
addChild("start", "Start a scraping task", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
task.start(event.getChannel());
|
||||||
|
event.reply("Task started!").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
}, new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true));
|
||||||
|
|
||||||
|
addChild("cancel", "Cancel a scraping task", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
task.cancel();
|
||||||
|
event.reply("Task cancelled!").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
}, new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true));
|
||||||
|
|
||||||
|
addChild("cookie", "Yum, cookies!", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
task.setCookie(event.getOption("cookie").getAsString());
|
||||||
|
event.reply("Cookie set! :cookie: :yum:").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true),
|
||||||
|
new OptionData(OptionType.STRING, "cookie", "Cookie to set", true));
|
||||||
|
|
||||||
|
addChild("useragent", "Set the UA for requests", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
task.setUserAgent(event.getOption("user_agent").getAsString());
|
||||||
|
event.reply("UA set!").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true),
|
||||||
|
new OptionData(OptionType.STRING, "user_agent", "UA to set", true));
|
||||||
|
|
||||||
|
SimpleParentCommand cmd = new SimpleParentCommand("filter", "Manage filters");
|
||||||
|
addChild(cmd);
|
||||||
|
|
||||||
|
OptionData filter = new OptionData(OptionType.STRING, "filter", "Filter to update", true)
|
||||||
|
.addChoice("Include", "include")
|
||||||
|
.addChoice("Exclude", "exclude")
|
||||||
|
.addChoice("Find", "find");
|
||||||
|
OptionData type = new OptionData(OptionType.STRING, "type", "Type of filter", true)
|
||||||
|
.addChoice("Exact Match", "exact")
|
||||||
|
.addChoice("Has Prefix", "prefix")
|
||||||
|
.addChoice("Regex", "regex");
|
||||||
|
|
||||||
|
cmd.addChild("add", "Add a value to a filter", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String filterS = event.getOption("filter").getAsString();
|
||||||
|
String typeS = event.getOption("type").getAsString();
|
||||||
|
List<String> vals = getFilterVals(task, filterS, typeS);
|
||||||
|
vals.add(event.getOption("value").getAsString());
|
||||||
|
event.reply("Filter updated").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true),
|
||||||
|
filter,
|
||||||
|
type,
|
||||||
|
new OptionData(OptionType.STRING, "value", "Value", true));
|
||||||
|
|
||||||
|
cmd.addChild("remove", "Remove a value from a filter", event -> {
|
||||||
|
String id = event.getOption("id").getAsString();
|
||||||
|
ScraperTask task = ScraperBot.TASKS.stream().filter(t -> t.getID().equals(id)).findFirst().orElse(null);
|
||||||
|
if(task == null) {
|
||||||
|
event.reply("Task not found").queue();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String filterS = event.getOption("filter").getAsString();
|
||||||
|
String typeS = event.getOption("type").getAsString();
|
||||||
|
List<String> vals = getFilterVals(task, filterS, typeS);
|
||||||
|
vals.remove(event.getOption("value").getAsString());
|
||||||
|
event.reply("Filter updated").queue();
|
||||||
|
}, event -> {
|
||||||
|
if(event.getFocusedOption().getName().equals("id")) {
|
||||||
|
event.replyChoiceStrings(ScraperBot.TASKS.stream().map(t -> t.getID()).toList()).queue();
|
||||||
|
}
|
||||||
|
},
|
||||||
|
new OptionData(OptionType.STRING, "id", "Task ID", true).setAutoComplete(true),
|
||||||
|
filter,
|
||||||
|
type,
|
||||||
|
new OptionData(OptionType.STRING, "value", "Value", true));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static List<String> getFilterVals(ScraperTask task, String filterS, String typeS) {
|
||||||
|
FilterList list;
|
||||||
|
switch(filterS) {
|
||||||
|
case "include":
|
||||||
|
default:
|
||||||
|
list = task.getInclude();
|
||||||
|
break;
|
||||||
|
case "exclude":
|
||||||
|
list = task.getExclude();
|
||||||
|
break;
|
||||||
|
case "find":
|
||||||
|
list = task.getFind();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<String> vals;
|
||||||
|
switch(typeS) {
|
||||||
|
case "exact":
|
||||||
|
default:
|
||||||
|
vals = list.getExact();
|
||||||
|
break;
|
||||||
|
case "prefix":
|
||||||
|
vals = list.getPrefix();
|
||||||
|
break;
|
||||||
|
case "regex":
|
||||||
|
vals = list.getRegex();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return vals;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
71
src/me/mrletsplay/scraperbot/command/SimpleCommand.java
Normal file
71
src/me/mrletsplay/scraperbot/command/SimpleCommand.java
Normal file
@ -0,0 +1,71 @@
|
|||||||
|
package me.mrletsplay.scraperbot.command;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
|
import net.dv8tion.jda.api.events.interaction.command.CommandAutoCompleteInteractionEvent;
|
||||||
|
import net.dv8tion.jda.api.events.interaction.command.SlashCommandInteractionEvent;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.OptionData;
|
||||||
|
|
||||||
|
public abstract class SimpleCommand {
|
||||||
|
|
||||||
|
private String name, description;
|
||||||
|
private List<SimpleCommand> children;
|
||||||
|
|
||||||
|
public SimpleCommand(String name, String description) {
|
||||||
|
this.name = name;
|
||||||
|
this.description = description;
|
||||||
|
this.children = new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getDescription() {
|
||||||
|
return description;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addChild(SimpleCommand cmd) {
|
||||||
|
children.add(cmd);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addChild(String name, String description, Consumer<SlashCommandInteractionEvent> action, Consumer<CommandAutoCompleteInteractionEvent> complete, OptionData... options) {
|
||||||
|
List<OptionData> opts = Arrays.asList(options);
|
||||||
|
addChild(new SimpleCommand(name, description) {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void action(SlashCommandInteractionEvent event) {
|
||||||
|
action.accept(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void complete(CommandAutoCompleteInteractionEvent event) {
|
||||||
|
if(complete != null) complete.accept(event);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<OptionData> getOptions() {
|
||||||
|
return opts;
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addChild(String name, String description, Consumer<SlashCommandInteractionEvent> action, OptionData... options) {
|
||||||
|
addChild(name, description, action, null, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<SimpleCommand> getChildren() {
|
||||||
|
return children;
|
||||||
|
}
|
||||||
|
|
||||||
|
public abstract void action(SlashCommandInteractionEvent event);
|
||||||
|
|
||||||
|
public void complete(CommandAutoCompleteInteractionEvent event) {}
|
||||||
|
|
||||||
|
public abstract List<OptionData> getOptions();
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,23 @@
|
|||||||
|
package me.mrletsplay.scraperbot.command;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import net.dv8tion.jda.api.events.interaction.command.SlashCommandInteractionEvent;
|
||||||
|
import net.dv8tion.jda.api.interactions.commands.build.OptionData;
|
||||||
|
|
||||||
|
public class SimpleParentCommand extends SimpleCommand {
|
||||||
|
|
||||||
|
public SimpleParentCommand(String name, String description) {
|
||||||
|
super(name, description);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void action(SlashCommandInteractionEvent event) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<OptionData> getOptions() {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user