DotnetSpider, a .NET Standard web crawling library. It is a lightweight, efficient, and fast high-level web crawling & scraping framework.
+--------------------+ +---------------------+
| Download Center | | Statistics Center |
+--------------------+ +----------^---------+ +----------^----------+
| Downloader Agent 1 +----+ | |
+--------------------+ | | |
| +----------v-------Message Queue---v----------+ +------------- Scheduler-----------------+
+--------------------+ | | +-------+ +----------+ +-------+ | | +-------+ +-------+ +----------+ |
| Downloader Agent 2 +----+<----> | Local | | RabbitMq | | Kafka | | | | Local | | Redis | | Database | |
+--------------------+ | | +-------+ +----------+ +-------+ | | +-------+ +-------+ +----------+ |
| +-----------------------^---------------------+ +-------------------^--------------------+
+--------------------+ | | |
| Downloader Agent 3 +----+ | |
+--------------------+ +-------Spider----------v---------------------+ |
| +-----------------+ +--------------------+ | |
| | SpeedController | | RequestSupply | | |
| +-----------------+ +--------------------+ <-----------------------+
| +----------------------------+ +----------+ | |
| | Configure Request delegate | | DataFlow | | |
| +----------------------------+ +----------+ | |
+---------------------------------------------+ +-----------v--------------+
| MySql, SqlServer, etc |
| ClickHouse |
- Visual Studio 2017 (15.3 or later)
- .NET Core 2.2 or later
$ sudo docker run --name mysql -d -p 3306:3306 --restart always -e MYSQL_ROOT_PASSWORD=1qazZAQ! mysql:5.7
sudo docker run --name redis -d -p 6379:6379 --restart always redis
sudo docker run --name sqlserver -d -p 1433:1433 --restart always -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=1qazZAQ!' mcr.microsoft.com/mssql/server:2017-latest
sudo docker run --name postgres -d -p 5432:5432 --restart always -e POSTGRES_PASSWORD=1qazZAQ! postgres
sudo docker run --name mongo -d -p 27017:27017 --restart always mongo
$ sudo docker run --name kafka -d -p 9092:9092 --restart always --net bridge -h kafka --env ADVERTISED_PORT=9092 spotify/kafka
$ sudo -s
bash-3.2# echo "127.0.0.1 kafka" >> /etc/hosts
Docker remote API for Mac
$ docker run -d -v /var/run/docker.sock:/var/run/docker.sock -p 2376:2375 \
    bobrik/socat TCP4-LISTEN:2375,fork,reuseaddr UNIX-CONNECT:/var/run/docker.sock
Please see the project DotnetSpider.Sample in the solution.
public class EntitySpider : Spider
public static void Run()
var builder = new SpiderBuilder();
var provider = builder.Build();
protected override void Initialize()
Scheduler = new QueueDistinctBfsScheduler();
Speed = 1;
Depth = 3;
DownloaderSettings.Type = DownloaderType.HttpClient;
AddDataFlow(new DataParser<BaiduSearchEntry>()).AddDataFlow(GetDefaultStorage());
new Request("", new Dictionary<string, string> {{"网站", "博客园"}}),
new Request("", new Dictionary<string, string> {{"网站", "博客园"}}));
[Schema("cnblogs", "cnblogs_entity_model")]
[EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)]
[ValueSelector(Expression = ".//a[@class='current']", Name = "类别", Type = SelectorType.XPath)]
class BaiduSearchEntry : EntityBase<BaiduSearchEntry>
protected override void Configure()
HasIndex(x => x.Title);
HasIndex(x => new {x.WebSite, x.Guid}, true);
public int Id { get; set; }
[ValueSelector(Expression = "类别", Type = SelectorType.Enviroment)]
public string Category { get; set; }
[ValueSelector(Expression = "网站", Type = SelectorType.Enviroment)]
public string WebSite { get; set; }
[ValueSelector(Expression = "//title")]
[ReplaceFormatter(NewValue = "", OldValue = " - 博客园")]
public string Title { get; set; }
[ValueSelector(Expression = "GUID", Type = SelectorType.Enviroment)]
public string Guid { get; set; }
[ValueSelector(Expression = ".//h2[@class='news_entry']/a")]
public string News { get; set; }
[ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")]
public string Url { get; set; }
[ValueSelector(Expression = ".//div[@class='entry_summary']", ValueOption = ValueOption.InnerText)]
public string PlainText { get; set; }
[ValueSelector(Expression = "DATETIME", Type = SelectorType.Enviroment)]
public DateTime CreationTime { get; set; }
public EntitySpider(IMessageQueue mq, IStatisticsService statisticsService, ISpiderOptions options, ILogger<Spider> logger, IServiceProvider services) : base(mq, statisticsService, options, logger, services)
+ MySql
+ Kafka
1. start DotnetSpider.DownloadCenter
2. start DotnetSpider.DownloaderAgent
3. run DotnetSpider.Sample/samples/DistributedSpider.Run
Command: -s [spider type name] -i [id] -a [arg1,arg2...] -d [true/false] -n [name] -c [configuration file]
1. -s: Type name of the spider, for example: EntitySpider
2. -i: Set spider id
3. -a: Pass arguments to spider's Run method
4. -n: Set spider name
5. -c: Set the config file path; for example, to run with a customized config: -c [configuration file]
When you want to collect a page whose content is loaded by JavaScript, there is only one thing to do: set the downloader to WebDriverDownloader.
Downloader = new WebDriverDownloader(Browser.Chrome);
- Make sure ChromeDriver.exe is in the bin folder when using Chrome; install it into your project from NuGet: Chromium.ChromeDriver
- Make sure you have already added a *.webdriver Firefox profile when using Firefox:
- Make sure PhantomJS.exe is in the bin folder when using PhantomJS; install it into your project from NuGet: PhantomJS
timeout 0
tcp-keepalive 60
QQ Group: 477731655 Email: [email protected]