Here is the source code. Make sure all of the files live in the same project directory, set the HOMEPAGE variable in main.py to the URL of the site you want to crawl, and then run main.py.
The source code for each file follows:
demo.py
import os


# Create a directory for the project if one does not already exist
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)


# Create the queue and crawled files (if they have not been created already)
def create_data_files(project_name, base_url):
    queue = os.path.join(project_name, 'queue.txt')
    crawled = os.path.join(project_name, 'crawled.txt')
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')


# Create a new file and write data to it
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Append new data onto the end of an existing file
def append_to_file(path, data):
    with open(path, 'a') as f:
        f.write(data + '\n')


# Delete the contents of a file
def delete_file_contents(path):
    open(path, 'w').close()


# Read a file and convert each line into a set item
def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


# Iterate through a set, writing each item to a file on its own line
def set_to_file(links, file_name):
    with open(file_name, 'w') as f:
        for l in sorted(links):
            f.write(l + '\n')
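The helpers in demo.py can be tried on their own before wiring up the spider. A minimal sketch, assuming a throwaway project name and a placeholder URL (both made up for illustration):

# Standalone sketch of the demo.py helpers (project name and URL are placeholders).
from demo import create_project_dir, create_data_files, file_to_set, set_to_file

create_project_dir('demo_project')                         # creates ./demo_project/ if missing
create_data_files('demo_project', 'https://example.com/')  # seeds queue.txt with the base URL
links = file_to_set('demo_project/queue.txt')              # {'https://example.com/'}
links.add('https://example.com/about')
set_to_file(links, 'demo_project/queue.txt')               # rewrites queue.txt, one link per line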
domain.py
from urllib.parse import urlparse


# Get the domain name (example.com) from a URL
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except Exception:
        return ''


# Get the sub-domain name (www.example.com) from a URL
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except Exception:
        return ''
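To see why only same-domain links survive the filter in spider.py, here is roughly what these two functions return for a sample URL (the URL below is a placeholder):

# Quick check of the domain helpers on a placeholder URL.
from domain import get_domain_name, get_sub_domain_name

print(get_sub_domain_name('https://www.example.com/about'))  # www.example.com
print(get_domain_name('https://www.example.com/about'))      # example.com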
spider.py
from urllib.request import urlopen
from link_finder import LinkFinder
from demo import *
from domain import *


class Spider:
    # Class variables, shared by every spider thread
    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/queue.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spider.base_url)

    # Create the project directory and data files, then load them into the sets
    @staticmethod
    def boot():
        create_project_dir(Spider.project_name)
        create_data_files(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    # Crawl a page that has not been crawled yet and update the data files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled ' + str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    # Download a page and return the set of links found in its HTML
    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode('utf-8')
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    # Queue new links, skipping ones already seen or outside the target domain
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if (url in Spider.queue) or (url in Spider.crawled):
                continue
            if Spider.domain_name != get_domain_name(url):
                continue
            Spider.queue.add(url)

    # Write the in-memory queue and crawled sets back to their files
    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
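Since everything on Spider is stored at class level, all worker threads share a single queue and crawled set, and crawl_page can be called as a static method. A minimal single-threaded sketch, using a placeholder project name and URL rather than the real configuration in main.py:

# Single-threaded sketch of driving the Spider class (names and URL are placeholders).
from spider import Spider
from domain import get_domain_name
from demo import file_to_set

base = 'https://example.com/'
Spider('demo_project', base, get_domain_name(base))  # boots and crawls the homepage
for url in file_to_set('demo_project/queue.txt'):    # crawl everything queued so far
    Spider.crawl_page('Solo worker', url)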
main.py
import threading
from queue import Queue
from spider import Spider
from domain import *
from demo import *

PROJECT_NAME = 'thesite'
HOMEPAGE = 'your url goes here'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()

# Boot the spider: creates the project files and crawls the homepage
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Check whether links are left in the queue file; if so, turn them into jobs
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


# Put each queued link on the thread-safe job queue, wait, then crawl again
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Create daemon worker threads (they die when the main thread exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Each worker takes the next job off the queue until the program exits
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


create_workers()
crawl()
link_finder.py
from html.parser import HTMLParser
from urllib import parse


# Parse an HTML page and collect every URL found in an <a href="..."> tag
class LinkFinder(HTMLParser):

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def error(self, message):
        pass

    # Called by HTMLParser for every opening tag; resolve relative hrefs against the base URL
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href':
                    url = parse.urljoin(self.base_url, value)
                    self.links.add(url)

    def page_links(self):
        return self.links
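LinkFinder can also be tested in isolation by feeding it a hand-written HTML string; the snippet and URLs below are made up purely for illustration:

# Feeding LinkFinder a small HTML snippet (URLs here are placeholders).
from link_finder import LinkFinder

finder = LinkFinder('https://example.com/', 'https://example.com/blog/')
finder.feed('<a href="/about">About</a> <a href="https://other.site/">Elsewhere</a>')
print(finder.page_links())
# {'https://example.com/about', 'https://other.site/'}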