From 40dd5a8861ff89059b1413130c678b7a9345d203 Mon Sep 17 00:00:00 2001
From: ian
Date: Mon, 7 Oct 2019 11:34:55 +0100
Subject: [PATCH 1/2] gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 26cda90..64621b3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -588,3 +588,6 @@
 /venv/lib/python3.7/site-packages/pymongo/uri_parser.py
 /venv/lib/python3.7/site-packages/pymongo-3.9.0.dist-info/WHEEL
 /venv/lib/python3.7/site-packages/pymongo/write_concern.py
+/venv/
+/.idea/
+/.env

From d7707bbaccfe1a2116f960bef24ce0a863b274d1 Mon Sep 17 00:00:00 2001
From: ian
Date: Mon, 21 Oct 2019 01:21:52 +0300
Subject: [PATCH 2/2] Data_mining course project

---
 README.md                                     |  33 +++-
 find_connections_vk/__init__.py               |   0
 find_connections_vk/items.py                  |  14 ++
 find_connections_vk/middlewares.py            | 131 ++++++++++++++++
 find_connections_vk/pipelines.py              |  19 +++
 find_connections_vk/settings.py               | 101 ++++++++++++
 find_connections_vk/spiders/__init__.py       |   4 +
 find_connections_vk/spiders/vk_connections.py | 144 ++++++++++++++++++
 runner.py                                     |  21 +++
 scrapy.cfg                                    |  11 ++
 10 files changed, 477 insertions(+), 1 deletion(-)
 create mode 100644 find_connections_vk/__init__.py
 create mode 100644 find_connections_vk/items.py
 create mode 100644 find_connections_vk/middlewares.py
 create mode 100644 find_connections_vk/pipelines.py
 create mode 100644 find_connections_vk/settings.py
 create mode 100644 find_connections_vk/spiders/__init__.py
 create mode 100644 find_connections_vk/spiders/vk_connections.py
 create mode 100644 runner.py
 create mode 100644 scrapy.cfg

diff --git a/README.md b/README.md
index d83bb30..c0b3d7e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,32 @@
-# Data_Mining
\ No newline at end of file
+# Data_Mining
+
+runner.py takes the ids of two VK users as input.
+The algorithm uses a breadth-first search instead of the standard depth-first search.
+
+Step 1 checks whether the two given ids are already friends with each other;
+step 2 checks whether the given ids have any mutual friends;
+step 3 and onwards: - request the friend list of user_1,
+                    - split it into batches of 100 users (a VK API limit),
+                    - check whether user_2 has any mutual friends with each batch of 100 ids.
+
+The first friend chain found is written to MongoDB.
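+
+As an illustration only (not part of the project code), here is a minimal sketch of how one
+batched friends.getMutual request is formed; TOKEN and the example ids are placeholders,
+and the URL layout mirrors make_vk_api_url in the spider:
+
+```python
+API_URL = 'https://api.vk.com/method/'
+TOKEN = '<access_token>'   # placeholder
+
+def split(ids, size):
+    """Split a list of ids into batches of at most `size` elements."""
+    return [ids[i:i + size] for i in range(0, len(ids), size)]
+
+def mutual_url(source_uid, target_uids):
+    """Build one friends.getMutual request URL for a batch of target ids."""
+    targets = ','.join(map(str, target_uids))
+    return (f'{API_URL}friends.getMutual?source_uid={source_uid}'
+            f'&target_uids={targets}&access_token={TOKEN}&v=5.89')
+
+friends_of_user_1 = list(range(1, 251))        # hypothetical ids
+for batch in split(friends_of_user_1, 100):    # at most 100 target ids per call
+    print(mutual_url(20367747, batch))
+```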
+
+Intuitively, this approach seemed faster to me than a straightforward enumeration of friends;
+I would like to test whether that is actually the case, but won't manage to before the deadline.
+
+I didn't have time to write a nice output processor, so I'll decode the result using one
+concrete item as an example (a small decoding sketch is included at the end of this patch):
+
+{'_id': ObjectId('5dacd5ec6967cc57cb1daafa'),
+ 'friends_chain': [19587588,                        <- user_1 (given as input)
+                   81771,                           <- friend_1 in the chain
+                   [{'common_count': 1,
+                     'common_friends': [178913514], <- mutual friends of friend_2 and user_2
+                     'id': 11081576},               <- friend_2 in the chain
+                    {'common_count': 1,
+                     'common_friends': [152465],    <- mutual friends of the other friend_2 and user_2
+                     'id': 11264606}],              <- the other friend_2 in the chain
+                   20367747]}                       <- user_2 (given as input)
+
+In effect, the above represents 2 chains that differ in their 3rd and 4th ids:
+[19587588, 81771, 11081576, 178913514, 20367747]
+and
+[19587588, 81771, 11264606, 152465, 20367747]
\ No newline at end of file
diff --git a/find_connections_vk/__init__.py b/find_connections_vk/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/find_connections_vk/items.py b/find_connections_vk/items.py
new file mode 100644
index 0000000..2fe8689
--- /dev/null
+++ b/find_connections_vk/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class FindConnectionsVkItem(scrapy.Item):
+    # The only payload field: the chain of user ids connecting uid_1 to uid_2.
+    _id = scrapy.Field()
+    friends_chain = scrapy.Field()
diff --git a/find_connections_vk/middlewares.py b/find_connections_vk/middlewares.py
new file mode 100644
index 0000000..ec05ccf
--- /dev/null
+++ b/find_connections_vk/middlewares.py
@@ -0,0 +1,131 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+from scrapy.downloadermiddlewares.retry import RetryMiddleware
+from scrapy.utils.response import response_status_message
+import time
+import json
+
+
+class TooManyRequestsRetryMiddleware(RetryMiddleware):
+
+    def __init__(self, crawler):
+        super(TooManyRequestsRetryMiddleware, self).__init__(crawler.settings)
+        self.crawler = crawler
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        return cls(crawler)
+
+    def process_response(self, request, response, spider):
+        if request.meta.get('dont_retry', False):
+            return response
+        elif json.loads(response.body).get('error'):
+            # The VK API reports errors (e.g. rate limiting) inside the JSON body
+            # rather than via the HTTP status: pause the engine briefly and retry.
+            self.crawler.engine.pause()
+            time.sleep(1)  # If the rate limit is renewed once a minute, use 60 seconds, and so on.
+            self.crawler.engine.unpause()
+            reason = response_status_message(response.status)
+            return self._retry(request, reason, spider) or response
+        elif response.status in self.retry_http_codes:
+            reason = response_status_message(response.status)
+            return self._retry(request, reason, spider) or response
+        return response
+
+
+class FindConnectionsVkSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+ + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class FindConnectionsVkDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/find_connections_vk/pipelines.py b/find_connections_vk/pipelines.py new file mode 100644 index 0000000..c980600 --- /dev/null +++ b/find_connections_vk/pipelines.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html +from pymongo import MongoClient + + +class FindConnectionsVkPipeline(object): + def __init__(self): + client = MongoClient('localhost', 27017) + self.mongo_base = client.vk_connections + + def process_item(self, item, spider): + mongo_coll_name = f"{spider.name} for {item.get('friends_chain')[0]} and {item.get('friends_chain')[-1]}" + collection = self.mongo_base[mongo_coll_name] + collection.insert_one(item) + return item diff --git a/find_connections_vk/settings.py b/find_connections_vk/settings.py new file mode 100644 index 0000000..c7ffc1d --- /dev/null +++ b/find_connections_vk/settings.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for find_connections_vk project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'find_connections_vk' + +SPIDER_MODULES = ['find_connections_vk.spiders'] +NEWSPIDER_MODULE = 'find_connections_vk.spiders' + +LOG_ENABLED = True +LOG_LEVEL = 'DEBUG' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'find_connections_vk (+http://www.yourdomain.com)' +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 " \ + "(KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36 " + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +CONCURRENT_REQUESTS = 25 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 0.2 +# The download delay setting will honor only one of: +CONCURRENT_REQUESTS_PER_DOMAIN = 25 +CONCURRENT_REQUESTS_PER_IP = 25 + +DEPTH_PRIORITY = 1 +SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' +SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' + +# Disable cookies (enabled by default) +COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +# SPIDER_MIDDLEWARES = { +# 'find_connections_vk.middlewares.FindConnectionsVkSpiderMiddleware': 543, +# } +URLLENGTH_LIMIT = 80000 + +# Enable or disable downloader middlewares +# See 
https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +DOWNLOADER_MIDDLEWARES = { + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, + 'find_connections_vk.middlewares.TooManyRequestsRetryMiddleware': 543, +} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'find_connections_vk.pipelines.FindConnectionsVkPipeline': 300, +} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/find_connections_vk/spiders/__init__.py b/find_connections_vk/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/find_connections_vk/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
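For reference outside the patch itself: a minimal sketch, assuming the local MongoDB instance
configured in FindConnectionsVkPipeline and the example ids from runner.py, of reading back a
stored chain; the collection name follows the pipeline's
f"{spider.name} for {uid_1} and {uid_2}" pattern.

```python
from pymongo import MongoClient

# Database and collection naming as used by FindConnectionsVkPipeline.
client = MongoClient('localhost', 27017)
collection = client.vk_connections['vk_connections for 19587588 and 20367747']

for item in collection.find():
    print(item['friends_chain'])
```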
diff --git a/find_connections_vk/spiders/vk_connections.py b/find_connections_vk/spiders/vk_connections.py
new file mode 100644
index 0000000..f791e02
--- /dev/null
+++ b/find_connections_vk/spiders/vk_connections.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+import scrapy
+import json
+import copy
+from scrapy.http import Request
+from scrapy.loader import ItemLoader
+from find_connections_vk.items import FindConnectionsVkItem
+from scrapy.exceptions import CloseSpider
+
+
+class VkConnectionsSpider(scrapy.Spider):
+    name = 'vk_connections'
+    allowed_domains = ['vk.com']
+    start_urls = ['https://vk.com']
+    api_url = 'https://api.vk.com/method/'
+
+    def __init__(self, uid_1, uid_2, user_login, token, *args, **kwargs):
+        self.uid_1 = int(uid_1)
+        self.uid_2 = int(uid_2)
+        self.user_login = user_login
+        self.access_token = token
+        self.friends_chain = [self.uid_1, self.uid_2]
+        super().__init__(*args, **kwargs)
+
+    def start_requests(self):
+        params = {'user_id': self.uid_1}
+        yield Request(url=self.make_vk_api_url('friends.get', params), callback=self.first_check_if_friends)
+        yield Request(url=self.make_vk_api_url('friends.get', params), callback=self.parse_list_of_friends,
+                      meta={'curr_friends_chain': self.friends_chain},
+                      dont_filter=True)
+
+    def first_check_if_friends(self, response):
+        """Step 1: check whether uid_2 is already a direct friend of uid_1."""
+        friends_list = json.loads(response.body).get('response').get('items')
+
+        if self.uid_2 in friends_list:
+            loader = self.fill_item(self.friends_chain)
+            yield loader.load_item()
+
+        else:
+            # Step 2: check whether uid_1 and uid_2 have any mutual friends.
+            params = {'source_uid': self.uid_1, 'target_uids': self.uid_2}
+            yield Request(url=self.make_vk_api_url('friends.getMutual', params),
+                          callback=self.parse,
+                          meta={'friends_list': [self.uid_1],
+                                'curr_friends_chain': self.friends_chain,
+                                'first_request': True})
+
+    def parse(self, response):
+        friends_chain = response.meta['curr_friends_chain']
+        response_json = json.loads(response.body).get('response')
+        mutual_friends = [friend for friend in response_json if friend.get('common_friends')]
+
+        if mutual_friends:
+            friends_chain.insert(-1, mutual_friends)
+            loader = self.fill_item(friends_chain)
+            yield loader.load_item()
+            raise CloseSpider('chain was found')
+
+        elif response.meta['first_request']:
+            # No direct mutual friends; the BFS branch started in start_requests
+            # keeps expanding the chain, so nothing to do here.
+            pass
+
+        else:
+            for friend in response.meta['friends_list']:
+                curr_friends_chain = copy.deepcopy(friends_chain)
+                curr_friends_chain.insert(-1, friend)
+                params = {'user_id': friend}
+                yield Request(self.make_vk_api_url('friends.get', params),
+                              callback=self.parse_list_of_friends,
+                              meta={'curr_friends_chain': curr_friends_chain})
+
+    def parse_list_of_friends(self, response):
+        list_of_friends = json.loads(response.body).get('response').get('items')
+
+        if len(list_of_friends) > 500:
+            split_list = self.split(list_of_friends, 500)
+        else:
+            split_list = [list_of_friends]
+
+        for list_ in split_list:
+            params = {'user_ids': ','.join(map(str, list_))}
+            yield Request(self.make_vk_api_url('users.get', params),
+                          callback=self.clear_list_of_friends,
+                          errback=self.split_users_more,
+                          meta={'curr_friends_chain': response.meta['curr_friends_chain'],
+                                'list_of_friends': list_})
+
+    def clear_list_of_friends(self, response):
+        """Remove closed and banned profiles from the friend list; otherwise
+        friends.getMutual returns an error if even one such profile is present."""
+        dirty_curr_list = json.loads(response.body).get('response')
+
+        if len(dirty_curr_list) > 100:
+            split_list = self.split(dirty_curr_list, 100)
+        else:
+            split_list = [dirty_curr_list]
+
+        for list_ in split_list:
+            clear_curr_list = [itm.get('id') for itm in list_
+                               if not itm.get('is_closed') and not itm.get('deactivated')]
+
+            params = {'source_uid': self.uid_2, 'target_uids': ','.join(map(str, clear_curr_list))}
+            yield Request(self.make_vk_api_url('friends.getMutual', params),
+                          callback=self.parse,
+                          meta={'friends_list': clear_curr_list,
+                                'curr_friends_chain': response.meta['curr_friends_chain'],
+                                'first_request': False})
+
+    def split_users_more(self, failure):
+        """Occasionally a request fails with a 414 error because its URL is too long,
+        so halve the number of users passed to each users.get request and retry."""
+        split_list = self.split(failure.request.meta['list_of_friends'],
+                                len(failure.request.meta['list_of_friends']) // 2)
+
+        for list_ in split_list:
+            params = {'user_ids': ','.join(map(str, list_))}
+            yield Request(self.make_vk_api_url('users.get', params),
+                          callback=self.clear_list_of_friends,
+                          errback=self.split_users_more,
+                          meta={'curr_friends_chain': failure.request.meta['curr_friends_chain'],
+                                'list_of_friends': list_})
+
+    def make_vk_api_url(self, method, params):
+        """Build a VK API request URL for the given method and parameters."""
+        params_string = '&'.join([f"{k}={v}" for k, v in params.items()])
+        result = f'{self.api_url}{method}?{params_string}&access_token={self.access_token}&v=5.89'
+        return result
+
+    @staticmethod
+    def fill_item(friends_chain):
+        loader = ItemLoader(item=FindConnectionsVkItem())
+        loader.add_value('friends_chain', friends_chain)
+        return loader
+
+    @staticmethod
+    def split(arr, size):
+        """Split `arr` into pieces of at most `size` elements."""
+        result_arr = []
+        while len(arr) > size:
+            piece = arr[:size]
+            result_arr.append(piece)
+            arr = arr[size:]
+        result_arr.append(arr)
+        return result_arr
diff --git a/runner.py b/runner.py
new file mode 100644
index 0000000..627f462
--- /dev/null
+++ b/runner.py
@@ -0,0 +1,21 @@
+import os
+from os.path import join, dirname
+from dotenv import load_dotenv
+from scrapy.crawler import CrawlerProcess
+from scrapy.settings import Settings
+
+from find_connections_vk import settings
+from find_connections_vk.spiders.vk_connections import VkConnectionsSpider
+
+dot_env = join(dirname(__file__), '.env')
+load_dotenv(dot_env)
+
+USER_ID = os.getenv('USER_ID')
+TOKEN = os.getenv('TOKEN')
+
+if __name__ == '__main__':
+    crawler_settings = Settings()
+    crawler_settings.setmodule(settings)
+    process = CrawlerProcess(settings=crawler_settings)
+    # Arguments map to VkConnectionsSpider.__init__(uid_1, uid_2, user_login, token).
+    process.crawl(VkConnectionsSpider, '19587588', '20367747', USER_ID, TOKEN)
+    process.start()
diff --git a/scrapy.cfg b/scrapy.cfg
new file mode 100644
index 0000000..f38a083
--- /dev/null
+++ b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = find_connections_vk.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = find_connections_vk
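
The README mentions that a friendlier output processor wasn't written. As an illustration
only, here is a minimal sketch, assuming the friends_chain layout shown in the README example
(flatten_chains is a hypothetical helper, not part of the project), that expands one stored
item into the flat id chains it encodes:

```python
# Expand a stored friends_chain (as shown in the README) into flat id chains.
def flatten_chains(chain):
    head, *middle, tail = chain
    # The last middle element may be a nested list of {'id', 'common_friends'} dicts.
    nested = middle[-1] if middle and isinstance(middle[-1], list) else []
    prefix = [head] + [uid for uid in middle if not isinstance(uid, list)]
    if not nested:
        return [prefix + [tail]]
    return [prefix + [f['id']] + f['common_friends'] + [tail] for f in nested]


stored_chain = [19587588, 81771,
                [{'common_count': 1, 'common_friends': [178913514], 'id': 11081576},
                 {'common_count': 1, 'common_friends': [152465], 'id': 11264606}],
                20367747]

for c in flatten_chains(stored_chain):
    print(c)
# [19587588, 81771, 11081576, 178913514, 20367747]
# [19587588, 81771, 11264606, 152465, 20367747]
```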