Update UA, Fix scraping
This commit is contained in:
parent
3f4ef49bc3
commit
bca3304603
2
pom.xml
2
pom.xml
@ -12,7 +12,7 @@
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.8.1</version>
|
||||
<configuration>
|
||||
<release>20</release>
|
||||
<release>17</release>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
@ -36,7 +36,7 @@ public class ScraperTask {
|
||||
|
||||
private String
|
||||
cookie, // For CF bypass, login etc.
|
||||
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0";
|
||||
userAgent = "Mozilla/5.0 (X11; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0";
|
||||
|
||||
private FilterList exclude;
|
||||
private FilterList include;
|
||||
@ -131,7 +131,7 @@ public class ScraperTask {
|
||||
break;
|
||||
}
|
||||
|
||||
System.out.println(toVisit.size() + " " + url); // TODO: show in info command
|
||||
System.out.println("Visiting " + url + " (" + toVisit.size() + " left)"); // TODO: show in info command
|
||||
toVisit.remove(url);
|
||||
if(!visited.add(url)) {
|
||||
System.out.println("Deja vu, I've been in this place before");
|
||||
@ -141,7 +141,7 @@ public class ScraperTask {
|
||||
Set<String> links = getLinks(url);
|
||||
if(Thread.currentThread().isInterrupted()) break;
|
||||
links.removeAll(visited);
|
||||
links.stream().filter(l -> include.matches(url) && !exclude.matches(url)).forEach(toVisit::add);
|
||||
links.stream().filter(l -> include.matches(l) && !exclude.matches(l)).forEach(toVisit::add);
|
||||
|
||||
Set<String> newFound = links.stream().filter(l -> find.matches(l)).collect(Collectors.toSet());
|
||||
System.out.println("FOUND: " + newFound); // TODO: show in info command
|
||||
@ -169,7 +169,8 @@ public class ScraperTask {
|
||||
}
|
||||
|
||||
private static String clean(String uri) throws URISyntaxException {
|
||||
URI u = URI.create(uri);
|
||||
// System.out.println(uri);
|
||||
URI u = URI.create(uri.replace(" ", "%20"));
|
||||
return new URI(u.getScheme(), u.getSchemeSpecificPart(), null).toString();
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user