Disclaimer: This framework is to help developers simplify the development process and improve development efficiency. Please do not use this framework to do anything that violates national laws. Anything the user does has nothing to do with the author of this framework.
LucasSpider is a .NET Standard web crawling library. It is a lightweight, efficient, and fast high-level web crawling and scraping framework.
To get the latest beta packages, add the MyGet feed:
<add key="myget.org" value="https://www.myget.org/F/zlzforever/api/v3/index.json" protocolVersion="3" />
-
Visual Studio 2017 (15.3 or later) or JetBrains Rider
-
Docker
-
MySql
docker run --name mysql -d -p 3306:3306 --restart always -e MYSQL_ROOT_PASSWORD=1qazZAQ! mysql:5.7
-
Redis (optional)
docker run --name redis -d -p 6379:6379 --restart always redis
-
SqlServer
docker run --name sqlserver -d -p 1433:1433 --restart always -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=1qazZAQ!' mcr.microsoft.com/mssql/server:2017-latest
-
PostgreSQL (optional)
docker run --name postgres -d -p 5432:5432 --restart always -e POSTGRES_PASSWORD=1qazZAQ! postgres
-
MongoDB (optional)
docker run --name mongo -d -p 27017:27017 --restart always mongo
-
RabbitMQ
docker run -d --restart always --name rabbitmq -p 4369:4369 -p 5671-5672:5671-5672 -p 25672:25672 -p 15671-15672:15671-15672 -e RABBITMQ_DEFAULT_USER=user -e RABBITMQ_DEFAULT_PASS=password rabbitmq:3-management
-
Docker remote API for Mac
docker run -d --restart always --name socat -v /var/run/docker.sock:/var/run/docker.sock -p 2376:2375 bobrik/socat TCP4-LISTEN:2375,fork,reuseaddr UNIX-CONNECT:/var/run/docker.sock
-
HBase
docker run -d --restart always --name hbase -p 20550:8080 -p 8085:8085 -p 9090:9090 -p 9095:9095 -p 16010:16010 dajobe/hbase
https://github.com/dotnetcore/LucasSpider/wiki
Please see the LucasSpider.Sample project in the solution.
/// <summary>
/// Sample spider that parses cnblogs news list pages into <see cref="CnblogsEntry"/>
/// entities and hands them to the default storage.
/// </summary>
public class EntitySpider : Spider
{
    /// <summary>
    /// Creates the spider and forwards all dependencies to the base <c>Spider</c> constructor.
    /// </summary>
    /// <param name="options">Spider options wrapper passed through to the base class.</param>
    /// <param name="services">Service bundle passed through to the base class.</param>
    /// <param name="logger">Logger passed through to the base class.</param>
    public EntitySpider(
        IOptions<SpiderOptions> options,
        SpiderServices services,
        ILogger<Spider> logger)
        : base(options, services, logger)
    {
    }

    #region Nested type: CnblogsEntry

    /// <summary>
    /// Entity whose properties are filled through the selector attributes declared on them.
    /// The class-level attributes select each <c>news_block</c> div as one entity, expose the
    /// text of the <c>current</c> link under the name "类别", and follow links found in the pager div.
    /// </summary>
    [Schema("cnblogs", "news")]
    [EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)]
    [GlobalValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)]
    [FollowRequestSelector(XPaths = new[] { "//div[@class='pager']" })]
    public class CnblogsEntry : EntityBase<CnblogsEntry>
    {
        /// <summary>Identifier; no selector is attached to it.</summary>
        public int Id { get; set; }

        /// <summary>
        /// Value read from the environment key "类别" (presumably supplied by the
        /// global value selector of the same name; confirm against the framework).
        /// </summary>
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "类别", Type = SelectorType.Environment)]
        public string Category { get; set; }

        /// <summary>
        /// Value read from the environment key "网站" (presumably the property attached
        /// to each seed request; confirm against the framework).
        /// </summary>
        [Required]
        [StringLength(200)]
        [ValueSelector(Expression = "网站", Type = SelectorType.Environment)]
        public string WebSite { get; set; }

        /// <summary>Page title with the " - 博客园" suffix replaced by an empty string.</summary>
        [StringLength(200)]
        [ValueSelector(Expression = "//title")]
        [ReplaceFormatter(NewValue = "", OldValue = " - 博客园")]
        public string Title { get; set; }

        /// <summary>Value read from the environment key "GUID".</summary>
        [StringLength(40)]
        [ValueSelector(Expression = "GUID", Type = SelectorType.Environment)]
        public string Guid { get; set; }

        /// <summary>Content of the anchor inside the <c>news_entry</c> heading.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
        public string News { get; set; }

        /// <summary>The <c>href</c> attribute of the anchor inside the <c>news_entry</c> heading.</summary>
        [ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
        public string Url { get; set; }

        /// <summary>Content of the <c>entry_summary</c> div.</summary>
        [ValueSelector(Expression = ".//div[@class='entry_summary']")]
        public string PlainText { get; set; }

        /// <summary>Value read from the environment key "DATETIME".</summary>
        [ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)]
        public DateTime CreationTime { get; set; }

        /// <summary>
        /// Declares the indexes for this entity: one on <see cref="Title"/> and one on the
        /// (<see cref="WebSite"/>, <see cref="Guid"/>) pair.
        /// </summary>
        protected override void Configure()
        {
            HasIndex(entry => entry.Title);

            // NOTE(review): the trailing `true` presumably marks this composite index as unique; confirm against HasIndex.
            HasIndex(entry => new { entry.WebSite, entry.Guid }, true);
        }
    }

    #endregion

    /// <summary>
    /// Builds a default host for <see cref="EntitySpider"/>, enables Serilog, selects the
    /// queue-distinct BFS scheduler with <c>HashSetDuplicateRemover</c>, and runs the host.
    /// </summary>
    /// <returns>A task that completes when the built host's <c>RunAsync</c> completes.</returns>
    public static async Task RunAsync()
    {
        var builder = Builder.CreateDefaultBuilder<EntitySpider>();
        builder.UseSerilog();
        builder.UseQueueDistinctBfsScheduler<HashSetDuplicateRemover>();

        var host = builder.Build();
        await host.RunAsync();
    }

    /// <summary>
    /// Registers the entity parser and the default storage as data flows (in that order),
    /// then enqueues the first two news list pages.
    /// </summary>
    /// <param name="stoppingToken">Cancellation token supplied by the caller; not used by this sample.</param>
    protected override async Task InitializeAsync(CancellationToken stoppingToken)
    {
        // Each seed request gets its own dictionary carrying the "网站" entry.
        Request CreateSeed(string url)
        {
            var properties = new Dictionary<string, string>
            {
                { "网站", "博客园" }
            };
            return new Request(url, properties);
        }

        AddDataFlow(new DataParser<CnblogsEntry>());
        AddDataFlow(GetDefaultStorage());

        await AddRequestsAsync(
            CreateSeed("https://news.cnblogs.com/n/page/1/"),
            CreateSeed("https://news.cnblogs.com/n/page/2/"));
    }

    /// <summary>
    /// Supplies this spider's identity: a newly generated <c>ObjectId</c> string and the name "博客园".
    /// </summary>
    /// <returns>A tuple of the generated id and the fixed name.</returns>
    protected override (string Id, string Name) GetIdAndName()
        => (ObjectId.NewId.ToString(), "博客园");
}
Coming soon
timeout 0
tcp-keepalive 60
Package | License |
---|---|
Bert.RateLimiters | Apache 2.0 |
MessagePack | MIT |
Newtonsoft.Json | MIT |
Dapper | Apache 2.0 |
HtmlAgilityPack | MIT |
ZCJ.HashedWheelTimer | MIT |
murmurhash | Apache 2.0 |
Serilog.AspNetCore | Apache 2.0 |
Serilog.Sinks.Console | Apache 2.0 |
Serilog.Sinks.RollingFile | Apache 2.0 |
Serilog.Sinks.PeriodicBatching | Apache 2.0 |
MongoDB.Driver | Apache 2.0 |
MySqlConnector | MIT |
AutoMapper.Extensions.Microsoft.DependencyInjection | MIT |
Docker.DotNet | MIT |
BuildBundlerMinifier | Apache 2.0 |
Pomelo.EntityFrameworkCore.MySql | MIT |
Quartz.AspNetCore | Apache 2.0 |
Quartz.AspNetCore.MySqlConnector | Apache 2.0 |
Npgsql | PostgreSQL License |
RabbitMQ.Client | Apache 2.0 |
Polly | BSD 3-Clause |
QQ Group: 477731655 Email: zlzforever@163.com