Skip to content

Commit

Permalink
Merge pull request #15 from wuyue92tree/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
wuyue92tree authored Sep 22, 2017
2 parents ddb4771 + 0bf78e4 commit c3e17c9
Show file tree
Hide file tree
Showing 22 changed files with 150 additions and 111 deletions.
16 changes: 11 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ Crwy
.. image:: https://img.shields.io/pypi/v/Crwy.svg
:target: https://pypi.python.org/pypi/Crwy
:alt: PyPI Version
.. image:: https://travis-ci.org/wuyue92tree/crwy.svg?branch=1.0.5
.. image:: https://travis-ci.org/wuyue92tree/crwy.svg?branch=1.0.6
:target: https://travis-ci.org/wuyue92tree/crwy
:alt: Build Status
.. image:: https://readthedocs.org/projects/crwy/badge/?version=1.0.5
:target: http://crwy.readthedocs.io/zh_CN/1.0.5/?badge=1.0.5
.. image:: https://readthedocs.org/projects/crwy/badge/?version=1.0.6
:target: http://crwy.readthedocs.io/zh_CN/1.0.6/?badge=1.0.6
:alt: Documentation Status

简介
Expand Down Expand Up @@ -38,11 +38,11 @@ Crwy是一个轻量级的爬虫抓取框架,参考Scrapy框架结构开发而
pip install crwy

or
前往下载: https://pypi.python.org/pypi/Crwy/1.0.5/
前往下载: https://pypi.python.org/pypi/Crwy/1.0.6/

使用手册
===================
在这里: http://crwy.readthedocs.io/zh_CN/1.0.5/
在这里: http://crwy.readthedocs.io/zh_CN/1.0.6/

友情链接
===================
Expand All @@ -55,6 +55,12 @@ or
修改日志
===================

2017-09-21 v1.0.6

- 日志新增timedRtLogger模板及自定义Logger调用接口
- 爬虫执行脚本新增thread支持
- 修改项目创建脚本,配置文件固定在conf目录

2017-06-13 v1.0.5

- 解决pypi版本问题。
Expand Down
2 changes: 1 addition & 1 deletion crwy/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.5
1.0.6
48 changes: 36 additions & 12 deletions crwy/commands/runspider.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,36 +5,52 @@
from __future__ import print_function
import sys
import gevent
import threading
from optparse import OptionParser
from crwy.commands.list import Command as ListCommand


class Command(object):
def execute(self, spider_name):
    """Import the spider module ``src.<spider_name>`` and run it once.

    The spider class is looked up by convention: it must be named
    ``<spider_name.capitalize()>Spider`` (note that ``str.capitalize``
    lower-cases everything after the first character).

    :param spider_name: name of the module inside the ``src`` package.
    :return: whatever the spider's ``run()`` method returns.
    :raises ImportError: if ``src.<spider_name>`` cannot be imported.
    :raises AttributeError: if the module has no matching spider class.
    """
    # Local import keeps this method self-contained; import_module returns
    # the submodule itself, so no second getattr() hop through the package
    # object is needed (as it was with a bare __import__).
    import importlib

    module = importlib.import_module('src.%s' % spider_name)
    spider_cls = getattr(module, spider_name.capitalize() + 'Spider')
    return spider_cls().run()

def multi_coroutine(self, spider_name, coroutine):
    """Run ``coroutine`` copies of the spider concurrently as gevent greenlets.

    :param spider_name: spider name, passed unchanged to ``self.execute``.
    :param coroutine: number of greenlets to spawn; anything ``int()``
        accepts (the command line supplies it as a string).

    Prints an error and exits with status 1 when ``coroutine`` cannot be
    converted to an int. Returns once every greenlet has finished.
    """
    try:
        coroutine = int(coroutine)
    except (TypeError, ValueError):
        print('ERROR: coroutine must be int!!!')
        sys.exit(1)

    # NOTE(review): patching happens here, after the module-level imports
    # have already run; gevent's docs advise patching as early as
    # possible -- confirm this late patch is sufficient for the spiders.
    from gevent import monkey
    monkey.patch_all()

    # ``range`` rather than ``xrange`` so this also runs on Python 3.
    gevent.joinall([
        gevent.spawn(self.execute, spider_name) for _ in range(coroutine)
    ])

def multi_thread(self, spider_name, thread):
    """Run ``thread`` copies of the spider concurrently, one per thread.

    :param spider_name: spider name, passed unchanged to ``self.execute``.
    :param thread: number of threads to start; anything ``int()`` accepts
        (the command line supplies it as a string).

    Prints an error and exits with status 1 when ``thread`` cannot be
    converted to an int. Blocks until every started thread has finished.
    """
    try:
        thread = int(thread)
    except (TypeError, ValueError):
        print('ERROR: thread must be int!!!')
        sys.exit(1)

    # ``range`` rather than ``xrange`` so this also runs on Python 3.
    thread_list = [
        threading.Thread(target=self.execute, args=(spider_name,))
        for _ in range(thread)
    ]
    for worker_thread in thread_list:
        worker_thread.start()
    # Join only after all threads are started so they actually overlap.
    for worker_thread in thread_list:
        worker_thread.join()

def main(self):
Usage = "Usage: crwy runspider [option] [args]"
parser = OptionParser(Usage)
Expand All @@ -43,20 +59,28 @@ def main(self):
parser.add_option(
'-c', '--coroutine', dest='coroutine',
help='crawler by multi coroutine', metavar="COROUTINE")
parser.add_option(
'-t', '--thread', dest='thread',
help='crawler by multi thread', metavar="THREAD")
opt, args = parser.parse_args()

if len(args) < 1:
print(Usage)
sys.exit(1)

if opt.name is not None:
if opt.name in ListCommand.get_spider_list():
sys.path.append('.')

if opt.coroutine and opt.thread:
print("Can not run use both coroutine and thread!")
sys.exit(1)

if opt.coroutine is not None:
self.multi_execute(opt.name, opt.coroutine)
self.multi_coroutine(opt.name, opt.coroutine)
elif opt.thread is not None:
self.multi_thread(opt.name, opt.thread)
else:
self.execute(opt.name, 'worker0')
self.execute(opt.name)
else:
print('ERROR spider: "%s" is not found!!!' % opt.name)
sys.exit(1)
else:
print(Usage)
sys.exit(1)
10 changes: 5 additions & 5 deletions crwy/commands/startproject.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,27 @@
LOG_PATH = os.path.join(PATH, 'log')
CONFIG_PATH = os.path.join(PATH, 'crwy.cfg.tmpl')
SETTINGS_PATH = os.path.join(PATH, 'settings.py.tmpl')
LOGCONFIG_PATH = os.path.join(PATH, 'default_logger.conf.tmpl')
LOGCONFIG_PATH = os.path.join(PATH, 'logger.conf.tmpl')


class Command(object):
def create_project(self, project_name):
    """Create the on-disk skeleton of a new project.

    Layout produced under ``<project_name>/``:

    - ``data/``, ``src/``, ``log/`` -- copied from the template trees
    - ``conf/logger.conf``          -- copied from the logger template
    - ``conf/settings.py``          -- settings template after
      ``change_project_name``
    - ``conf/__init__.py``          -- empty file
    - ``crwy.cfg``                  -- config template after
      ``change_project_name``

    :param project_name: directory to create, relative to the current
        working directory.
    :raises OSError: from ``os.mkdir`` / ``shutil`` when a directory
        cannot be created, e.g. because it already exists.
    """
    conf_dir = os.path.join(project_name, 'conf')

    os.mkdir(project_name)
    os.mkdir(conf_dir)
    shutil.copytree(DATA_PATH, os.path.join(project_name, 'data'))
    shutil.copytree(SRC_PATH, os.path.join(project_name, 'src'))
    shutil.copytree(LOG_PATH, os.path.join(project_name, 'log'))
    shutil.copy(LOGCONFIG_PATH, os.path.join(conf_dir, 'logger.conf'))

    # change_project_name presumably substitutes the project name into the
    # template text -- its definition is outside this view; its return
    # value is written out verbatim.
    config = change_project_name(project_name, CONFIG_PATH)
    with open(os.path.join(project_name, 'crwy.cfg'), 'w') as cfg_file:
        cfg_file.write(config)

    settings = change_project_name(project_name, SETTINGS_PATH)
    with open(os.path.join(conf_dir, 'settings.py'), 'w') as settings_file:
        settings_file.write(settings)

    # Empty __init__.py so that ``conf`` is importable as a package.
    with open(os.path.join(conf_dir, '__init__.py'), 'w') as init_file:
        init_file.write('')

def main(self):
Expand Down
1 change: 1 addition & 0 deletions crwy/templates/project/data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
### 这是一个数据文件夹,用于储存来自sqlite的数据。
1 change: 0 additions & 1 deletion crwy/templates/project/data/data_path.txt

This file was deleted.

1 change: 1 addition & 0 deletions crwy/templates/project/log/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
### 这是一个日志文件夹,用于保存爬取日志
1 change: 0 additions & 1 deletion crwy/templates/project/log/log_path.txt

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#logger.conf
###############################################
[loggers]
keys=root,fileLogger,rtLogger
keys=root,fileLogger,rtLogger,timedRtLogger

[logger_root]
level=INFO
Expand All @@ -17,9 +17,14 @@ handlers=consoleHandler,rtHandler
qualname=rtLogger
propagate=0

[logger_timedRtLogger]
handlers=consoleHandler,timedRtHandler
qualname=timedRtLogger
propagate=0

###############################################
[handlers]
keys=consoleHandler,fileHandler,rtHandler
keys=consoleHandler,fileHandler,rtHandler,timedRtHandler

[handler_consoleHandler]
class=StreamHandler
Expand All @@ -37,17 +42,24 @@ args=('./log/default.log', 'a')
class=handlers.RotatingFileHandler
level=DEBUG
formatter=defaultFmt
args=('./log/default.log', 'a', 10*1024*1024, 5)
args=('./log/default.log', 'a', 100*1024*1024, 10)

[handler_timedRtHandler]
class=handlers.TimedRotatingFileHandler
level=DEBUG
formatter=defaultFmt
args=('./log/default.log', 'M', 1, 10)


###############################################

[formatters]
keys=defaultFmt,simpleFmt

[formatter_defaultFmt]
format=%(asctime)s %(filename)s %(funcName)s [line:%(lineno)d] %(levelname)s %(message)s
format=%(asctime)s %(filename)s %(funcName)s %(threadName)s [line:%(lineno)d] %(levelname)s %(message)s
datefmt=%Y-%m-%d %H:%M:%S

[formatter_simpleFmt]
format=%(name)-12s: %(levelname)-8s %(message)s
datefmt=
format=%(asctime)s %(threadName)s %(levelname)s %(message)s
datefmt=%Y-%m-%d %H:%M:%S
2 changes: 1 addition & 1 deletion crwy/templates/project/settings.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ import os

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

DATEBASE_DIR = os.path.join(BASE_DIR, 'data')
DATA_DIR = os.path.join(BASE_DIR, 'data')
7 changes: 3 additions & 4 deletions crwy/templates/spider/basic.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,11 @@ class ${class_name}Spider(Spider):
soups = self.html_parser.parser(response.content)
print(url)
print(soups)
self.logger.info('%s[%s] --> crawler success !!!' % (
self.spider_name, self.worker))
self.logger.info('%s --> crawler success !!!' % self.spider_name)

except Exception as e:
self.logger.exception('%s[%s] --> %s' % (
self.spider_name, self.worker, e))
self.logger.exception('%s --> %s' % (
self.spider_name, e))

def run(self):
self.crawler_${spider_name}()
8 changes: 4 additions & 4 deletions crwy/templates/spider/queue.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ class ${class_name}Spider(Spider):
print(soups)
print('Length of queue : %d' % queue.qsize())
else:
self.logger.info('%s[%s] --> crawler success !!!' % (
self.spider_name, self.worker))
self.logger.info('%s --> crawler success !!!' %
self.spider_name)
sys.exit()

except Exception as e:
self.logger.exception('%s[%s] --> %s' % (
self.spider_name, self.worker, e))
self.logger.exception('%s --> %s' % (
self.spider_name, e))
continue

def run(self):
Expand Down
8 changes: 4 additions & 4 deletions crwy/templates/spider/redis_queue.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,13 @@ class ${class_name}Spider(Spider):
print(soups)
print('Length of queue : %s' % queue.qsize())
else:
self.logger.info('%s[%s] --> crawler success !!!' % (
self.spider_name, self.worker))
self.logger.info('%s --> crawler success !!!' %
self.spider_name)
sys.exit()

except Exception as e:
self.logger.exception('%s[%s] --> %s' % (
self.spider_name, self.worker, e))
self.logger.exception('%s --> %s' % (
self.spider_name, e))
continue

def add_queue(self):
Expand Down
8 changes: 4 additions & 4 deletions crwy/templates/spider/sqlite.py.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ class ${class_name}Spider(Spider):
self.sql.session.commit()
print(url)
print(soups)
self.logger.info('%s[%s] --> crawler success !!!' % (
self.spider_name, self.worker))
self.logger.info('%s --> crawler success !!!' %
self.spider_name)

except Exception as e:
self.logger.exception('%s[%s] --> %s' % (
self.spider_name, self.worker, e))
self.logger.exception('%s --> %s' % (
self.spider_name, e))

def run(self):
self.crawler_${spider_name}()
23 changes: 0 additions & 23 deletions crwy/utils/decorator.py

This file was deleted.

18 changes: 10 additions & 8 deletions crwy/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,10 @@

import logging
import logging.config
from configparser import NoSectionError

try:
from crwy.cmdline import get_project_name
except ImportError:
pass

# Best-effort logging setup at import time: load the project's logging
# configuration when the process is started from a project root that has
# conf/logger.conf. Failure is tolerated so this module can still be
# imported elsewhere, but only ordinary errors are swallowed -- a bare
# ``except:`` would also hide KeyboardInterrupt and SystemExit.
try:
    logging.config.fileConfig('./conf/logger.conf')
except Exception:
    pass


Expand All @@ -27,3 +21,11 @@ def file_logger():
@staticmethod
def rt_logger():
    """Return the logger registered under the name ``'rtLogger'``."""
    rotating_logger = logging.getLogger('rtLogger')
    return rotating_logger

@staticmethod
def timed_rt_logger():
    """Return the logger registered under the name ``'timedRtLogger'``."""
    timed_logger = logging.getLogger('timedRtLogger')
    return timed_logger

@staticmethod
def extra_logger(name=None):
    """Return the logger called ``name``.

    :param name: logger name; with the default ``None``,
        ``logging.getLogger`` hands back the root logger.
    """
    custom_logger = logging.getLogger(name)
    return custom_logger
8 changes: 5 additions & 3 deletions crwy/utils/mail.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: wuyue92tree@163.com

from __future__ import print_function

import smtplib
import traceback
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
Expand Down Expand Up @@ -62,6 +64,6 @@ def send_mail(self, mail_to, sub, content, subtype='plain', charset='utf8',
server.sendmail(self.mail_user, mail_to, msg.as_string())
server.close()
return True
except Exception, e:
print str(e)
except Exception as e:
traceback.format_exc(e)
return False
Loading

0 comments on commit c3e17c9

Please sign in to comment.