Update UA, Fix scraping

This commit is contained in:
MrLetsplay 2024-03-28 22:40:38 +01:00
parent 3f4ef49bc3
commit bca3304603
Signed by: mr
SSH Key Fingerprint: SHA256:92jBH80vpXyaZHjaIl47pjRq+Yt7XGTArqQg1V7hSqg
2 changed files with 6 additions and 5 deletions

View File

@ -12,7 +12,7 @@
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.1</version>
<configuration>
<release>20</release>
<release>17</release>
</configuration>
</plugin>
</plugins>

View File

@ -36,7 +36,7 @@ public class ScraperTask {
private String
cookie, // For CF bypass, login etc.
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0";
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0";
private FilterList exclude;
private FilterList include;
@ -131,7 +131,7 @@ public class ScraperTask {
break;
}
System.out.println(toVisit.size() + " " + url); // TODO: show in info command
System.out.println("Visiting " + url + " (" + toVisit.size() + " left)"); // TODO: show in info command
toVisit.remove(url);
if(!visited.add(url)) {
System.out.println("Deja vu, I've been in this place before");
@ -141,7 +141,7 @@ public class ScraperTask {
Set<String> links = getLinks(url);
if(Thread.currentThread().isInterrupted()) break;
links.removeAll(visited);
links.stream().filter(l -> include.matches(url) && !exclude.matches(url)).forEach(toVisit::add);
links.stream().filter(l -> include.matches(l) && !exclude.matches(l)).forEach(toVisit::add);
Set<String> newFound = links.stream().filter(l -> find.matches(l)).collect(Collectors.toSet());
System.out.println("FOUND: " + newFound); // TODO: show in info command
@ -169,7 +169,8 @@ public class ScraperTask {
}
/**
 * Normalizes a link by rebuilding it from its scheme and scheme-specific part only;
 * the fragment argument is passed as {@code null}, so a fragment is not carried over.
 *
 * NOTE(review): this span is a rendered diff hunk whose +/- markers are missing, which is
 * why {@code u} appears to be declared twice. The hunk header reports one line net added,
 * so the first declaration is presumably the removed line and the second its replacement
 * (the commented-out println presumably being the other added line) - confirm against the commit.
 *
 * @param uri the link text to normalize
 * @return the string form of the rebuilt URI
 * @throws URISyntaxException if the URI constructor rejects the given components
 */
private static String clean(String uri) throws URISyntaxException {
// Presumably the pre-change line: parses the link exactly as given.
URI u = URI.create(uri);
// System.out.println(uri);
// Presumably the post-change line: literal spaces are replaced with %20 before parsing.
URI u = URI.create(uri.replace(" ", "%20"));
// Rebuild from scheme + scheme-specific part; the null third argument omits the fragment.
return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString();
}