diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 4e2e5701..4b6f4273 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -29,8 +29,9 @@ jobs: - name: Run owl run: - poetry run python src/carrier-owl.py + poetry run python src/carrier_owl.py env: SLACK_ID: ${{ secrets.SLACK_ID }} + LINE_TOKEN: ${{ secrets.LINE_TOKEN }} diff --git a/README.md b/README.md index 8f2a6c9e..cbfa4fe2 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ 伝書フクロウという意味です。 ## About Carrier Owl + 前日のarxivから気になる論文にスコアを付けてslackに通知するシステムです。 @@ -42,21 +43,16 @@ - -3. **webhook urlの取得** - - 特定のslackチャンネルに流すための準備を行います。 - - incomming webhookの**webhook url**を取得してください。 - - 参考サイト - - [公式](https://slack.com/intl/ja-jp/help/articles/115005265063-Slack-での-Incoming-Webhook-の利用) - - [紹介記事](https://qiita.com/vmmhypervisor/items/18c99624a84df8b31008) - - slack通知の時のアイコンが設定できますので、よければこれ使ってください。 - - [icon](https://github.com/fkubota/Carrier-Owl/blob/master/data/images/carrier-owl.png) - +3. **通知先の設定** + 通知したいアプリケーションに応じて設定を行います。いくつでも設定可能です。 + + - Slackに通知する場合は[こちら](./docs/slack_setup.md) + - LINEに通知する場合は[こちら](./docs/line_setup.md) 4. **webhook urlの設定** - - step3で取得した `webhook url` を設定します。 + - step3で取得した `webhook url`(または `line token`) を設定します。 - 手順 a. `settings` をクリック。 @@ -67,29 +63,45 @@ c. `New repository secret` をクリック。 - d. Nameを `SLACK_ID` と入力。Valueを **step2** で取得した`webhook url`を貼り付けます。 + d. Nameを `SLACK_ID`(または `LINE_TOKEN` ) と入力。Valueを **step3** で取得した`webhook url`(または `line token`)を貼り付けます。 e. 最後に`Add secret`をクリックして登録完了です。 5. **領域の設定** + - 通知させたいarxivの論文の領域を指定します。 - **(computer scienceの人はこの手順を飛ばしてstep8に進んでも構いません)** - - `computer science` なら `cs` などそれぞれに名前がついています。以下の手順で確認します。 - 手順 - 1. [arxiv.org](https://arxiv.org)にアクセス - 2. 通知させたい領域の**resent**と書かれた部分をクリック。 - - - - 3. 遷移後のページのurlを見て、`list/`と`/recent`に囲われている文字列を使います。 - - - computer scienceの例: `https://arxiv.org/list/cs/recent` - - この場合、`cs` をこの後利用する。 - - 4. 
`config.yaml` 内の、`subject` を3で取得した文字列に変更します。(デフォルトでは`cs`になっています。) - + 1. 以下の表から通知を受け取りたいsubjectを選択して、urlをクリックしてください。 + + | subject | category | url | + | ------------------------------------------ | -------- | ----------------------------------------- | + | Astrophysics | astro-ph | [url](https://arxiv.org/archive/astro-ph) | + | Condensed Matter | cond-mat | [url](https://arxiv.org/archive/cond-mat) | + | Physics | physics | [url](https://arxiv.org/archive/physics) | + | Mathematics | math | [url](https://arxiv.org/archive/math) | + | Nonlinear Sciences | nlin | [url](https://arxiv.org/archive/nlin) | + | Computer Science | cs | [url](https://arxiv.org/archive/cs) | + | Quantitative Biology | q-bio | [url](https://arxiv.org/archive/q-bio) | + | Quantitative Finance | q-fin | [url](https://arxiv.org/archive/q-fin) | + | Statistics | stat | [url](https://arxiv.org/archive/stat) | + | Electrical Engineering and Systems Science | eess | [url](https://arxiv.org/archive/eess) | + | Economics | econ | [url](https://arxiv.org/archive/econ) | + + 2. さらに細かい分類を確認します + - 以下の例は、subject = `cs` をクリックした場合です。`cs.AI` や `cs.CL` などが細かな分類になります。 + + + + 3. `config.yaml` 内の、`subject` を2で確認した文字列に変更します。 + - デフォルトでは`cat:cs.*`になっています。これは、cs以下の小分類すべてを通知するという設定になります。 + - **複数領域指定** + - 複数領域指定も可能です。以下のように `OR` でつなぎます。 + - ex1) `cat:cs.AI OR cat:cs.CV` + - ex2) `cat:physics.* OR cat:cs.*` + - ex3) `cat:physics.space-ph OR cat:cs.AI OR cat:q-bio.BM` 6. **キーワードの設定** @@ -119,11 +131,20 @@ - ここまでの変更がmasterブランチに反映されていれば、これですべての設定が完了したことになります。次の通知タイミングでslackに通知されます。 9. 
**test** - - 試しに動かしてみたい場合は、`master` ブランチから `test-send-to-slack` ブランチを作成してください。`test-send-to-slack` ブランチが作られるとgithub actionsが走って問題なければ通知されるはずです。 - - Actionsタブで様子を確認できます。 + - github actions の `workflow_dispatch` を使って通知タイミングを待たずにいつでも実行することができます。 + - Actionsタブから、`Run workflow` をクリックすることでいつでも実行可能です。 + + - +## その他の設定 +- **score threshold** + 通知するスコアに閾値を設定することができます。`score >= score_threshold` を満たす論文のみ通知させることができます。 + `config.yaml` 内の、`score_threshold` で設定できます(デフォルトは0になっています)。 ## Thanks -- [hppさん](https://github.com/hppRC)のお力をお借りして、v2.0.0から `github` だけで動作するようになりました。ご協力ありがとうございました。 +- [hppさん](https://github.com/hppRC)のPRにより、github actionsを使うことにより `github` だけで動作するようになりました。 + +- [wakamezakeさん](https://github.com/wakamezake)のPRにより、arxiv-apiを導入しました。 + +- [amagaeruさん](https://github.com/amagaeru1113)のPRにより、LINE通知機能を実装しました。 \ No newline at end of file diff --git a/config.yaml b/config.yaml index 27a6a48b..4b509c40 100644 --- a/config.yaml +++ b/config.yaml @@ -1,8 +1,11 @@ # arxivの学問領域の指定 -subject: 'cs' +subject: 'cat:cs.*' # 検索キーワード keywords: kaggle: 3 resnet: 3 anomaly detection: 1 + +# 通知の閾値 +score_threshold: 0 diff --git a/data/carrier-owl.gvdesign b/data/carrier-owl.gvdesign index 6dd785d8..4a4a5365 100644 Binary files a/data/carrier-owl.gvdesign and b/data/carrier-owl.gvdesign differ diff --git a/data/images/10.png b/data/images/10.png new file mode 100644 index 00000000..e05a8a92 Binary files /dev/null and b/data/images/10.png differ diff --git a/data/images/11.png b/data/images/11.png new file mode 100644 index 00000000..7cc45f04 Binary files /dev/null and b/data/images/11.png differ diff --git a/data/images/line/001.png b/data/images/line/001.png new file mode 100644 index 00000000..de666f84 Binary files /dev/null and b/data/images/line/001.png differ diff --git a/data/images/line/002.jpg b/data/images/line/002.jpg new file mode 100644 index 00000000..c46ec3f7 Binary files /dev/null and b/data/images/line/002.jpg differ diff --git a/data/images/system.png 
b/data/images/system.png index 85504abf..dcb41286 100644 Binary files a/data/images/system.png and b/data/images/system.png differ diff --git a/docs/line_setup.md b/docs/line_setup.md new file mode 100644 index 00000000..06569607 --- /dev/null +++ b/docs/line_setup.md @@ -0,0 +1,14 @@ +**LINE通知導入手順** + +1. LINE Notify(https://notify-bot.line.me/ja/) にアクセスし、ログイン + +2. ページ右上にある自分の名前をクリックし、マイページに移動 + +3. マイページ下部でアクセストークン発行 + + - 下記ページで発行 + + + + - 発行時、通知するトークルームを設定できるので同じトークルームに入っている複数名に通知可能 + diff --git a/docs/slack_setup.md b/docs/slack_setup.md new file mode 100644 index 00000000..18e9daf0 --- /dev/null +++ b/docs/slack_setup.md @@ -0,0 +1,15 @@ +**webhook urlの取得** + +特定のslackチャンネルに流すための準備を行います。 + +1. incomming webhookの**webhook url**を取得してください。 + - 参考サイト + - [公式](https://slack.com/intl/ja-jp/help/articles/115005265063-Slack-での-Incoming-Webhook-の利用) + + - [紹介記事](https://qiita.com/vmmhypervisor/items/18c99624a84df8b31008) + +2. slack通知の時のアイコンが設定できますので、よければこれ使ってください。 + - [icon](https://github.com/fkubota/Carrier-Owl/blob/master/data/images/carrier-owl.png) + + + diff --git a/poetry.lock b/poetry.lock index 3f900eeb..3ad5e86e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,70 +1,80 @@ [[package]] +name = "arxiv" +version = "0.5.3" +description = "Python wrapper for the arXiv API: http://arxiv.org/help/api/" category = "main" -description = "Screen-scraping library" -name = "beautifulsoup4" optional = false python-versions = "*" + +[package.dependencies] +feedparser = "*" +requests = "*" + +[[package]] +name = "beautifulsoup4" version = "4.9.3" +description = "Screen-scraping library" +category = "main" +optional = false +python-versions = "*" [package.dependencies] -[package.dependencies.soupsieve] -python = ">=3.0" -version = ">1.2" +soupsieve = {version = ">1.2", markers = "python_version >= \"3.0\""} [package.extras] html5lib = ["html5lib"] lxml = ["lxml"] [[package]] -category = "main" -description = "Python package for providing Mozilla's CA Bundle." 
name = "certifi" +version = "2020.12.5" +description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = "*" -version = "2020.12.5" [[package]] -category = "main" -description = "Universal encoding detector for Python 2 and 3" name = "chardet" +version = "4.0.0" +description = "Universal encoding detector for Python 2 and 3" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "4.0.0" [[package]] -category = "main" -description = "Installer for chromedriver." name = "chromedriver-binary" +version = "87.0.4280.88.0" +description = "Installer for chromedriver." +category = "main" optional = false python-versions = "*" -version = "87.0.4280.88.0" [[package]] +name = "feedparser" +version = "6.0.2" +description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" category = "main" -description = "A nested progress with plotting options for fastai" -name = "fastprogress" optional = false python-versions = ">=3.6" -version = "1.0.0" [package.dependencies] -numpy = "*" +sgmllib3k = "*" [[package]] -category = "main" -description = "Internationalized Domain Names in Applications (IDNA)" name = "idna" +version = "2.10" +description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -version = "2.10" [[package]] -category = "main" -description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." name = "lxml" +version = "4.6.2" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" -version = "4.6.2" [package.extras] cssselect = ["cssselect (>=0.7)"] @@ -73,28 +83,20 @@ htmlsoup = ["beautifulsoup4"] source = ["Cython (>=0.29.7)"] [[package]] -category = "main" -description = "NumPy is the fundamental package for array computing with Python." -name = "numpy" -optional = false -python-versions = ">=3.6" -version = "1.19.4" - -[[package]] -category = "main" -description = "YAML parser and emitter for Python" name = "pyyaml" +version = "5.3.1" +description = "YAML parser and emitter for Python" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "5.3.1" [[package]] -category = "main" -description = "Python HTTP for Humans." name = "requests" +version = "2.25.1" +description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -version = "2.25.1" [package.dependencies] certifi = ">=2017.4.17" @@ -107,62 +109,63 @@ security = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)"] socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7)", "win-inet-pton"] [[package]] -category = "main" -description = "Python bindings for Selenium" name = "selenium" +version = "3.141.0" +description = "Python bindings for Selenium" +category = "main" optional = false python-versions = "*" -version = "3.141.0" [package.dependencies] urllib3 = "*" [[package]] +name = "sgmllib3k" +version = "1.0.0" +description = "Py3k port of sgmllib." category = "main" -description = "slack bot for incomming webhook" -name = "slackweb" optional = false python-versions = "*" -version = "1.0.5" [[package]] +name = "slackweb" +version = "1.0.5" +description = "slack bot for incomming webhook" category = "main" -description = "A modern CSS selector implementation for Beautiful Soup." 
-marker = "python_version >= \"3.0\"" +optional = false +python-versions = "*" + +[[package]] name = "soupsieve" +version = "2.1" +description = "A modern CSS selector implementation for Beautiful Soup." +category = "main" optional = false python-versions = ">=3.5" -version = "2.1" [[package]] -category = "main" -description = "HTTP library with thread-safe connection pooling, file post, and more." name = "urllib3" +version = "1.26.2" +description = "HTTP library with thread-safe connection pooling, file post, and more." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" -version = "1.26.2" [package.extras] brotli = ["brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,<1.5.7 || >1.5.7,<2.0)"] -[[package]] -category = "main" -description = "A built-package format for Python" -name = "wheel" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -version = "0.36.2" - -[package.extras] -test = ["pytest (>=3.0.0)", "pytest-cov"] - [metadata] -content-hash = "05050a5175d90fa7d619468fece6d5edcba41af3ac12b314d64f2455056475aa" +lock-version = "1.1" python-versions = "^3.8" +content-hash = "7f15d7a1d2280c81c4a9e1596d9c34938d2d92cbbf3ec42113f1a12e8cce30b9" [metadata.files] +arxiv = [ + {file = "arxiv-0.5.3-py3-none-any.whl", hash = "sha256:98bdb4f74f4041b377d9c0e97c41e99ade89e37cf6b7890834419b1e2b152b4c"}, + {file = "arxiv-0.5.3.tar.gz", hash = "sha256:9010cf132bcfb67c07bd363c49d519365310f92ac9da1a06509f712798265987"}, +] beautifulsoup4 = [ {file = "beautifulsoup4-4.9.3-py2-none-any.whl", hash = "sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35"}, {file = "beautifulsoup4-4.9.3-py3-none-any.whl", hash = "sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666"}, @@ -179,9 +182,9 @@ chardet = [ chromedriver-binary = [ {file = 
"chromedriver-binary-87.0.4280.88.0.tar.gz", hash = "sha256:5c8c41176e4c6f10a7083697126624f6b5f491969a0c214541e89ed1991d6888"}, ] -fastprogress = [ - {file = "fastprogress-1.0.0-py3-none-any.whl", hash = "sha256:474cd6a6e5b1c29a02383d709bf71f502477d0849bddc6ba5aa80b683f4ad16f"}, - {file = "fastprogress-1.0.0.tar.gz", hash = "sha256:89e28ac1d2a5412aab18ee3f3dfd1ee8b5c1f2f7a44d0add0d0d4f69f0191bfe"}, +feedparser = [ + {file = "feedparser-6.0.2-py3-none-any.whl", hash = "sha256:f596c4b34fb3e2dc7e6ac3a8191603841e8d5d267210064e94d4238737452ddd"}, + {file = "feedparser-6.0.2.tar.gz", hash = "sha256:1b00a105425f492f3954fd346e5b524ca9cef3a4bbf95b8809470e9857aa1074"}, ] idna = [ {file = "idna-2.10-py2.py3-none-any.whl", hash = "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0"}, @@ -226,42 +229,6 @@ lxml = [ {file = "lxml-4.6.2-cp39-cp39-win_amd64.whl", hash = "sha256:535332fe9d00c3cd455bd3dd7d4bacab86e2d564bdf7606079160fa6251caacf"}, {file = "lxml-4.6.2.tar.gz", hash = "sha256:cd11c7e8d21af997ee8079037fff88f16fda188a9776eb4b81c7e4c9c0a7d7fc"}, ] -numpy = [ - {file = "numpy-1.19.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e9b30d4bd69498fc0c3fe9db5f62fffbb06b8eb9321f92cc970f2969be5e3949"}, - {file = "numpy-1.19.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fedbd128668ead37f33917820b704784aff695e0019309ad446a6d0b065b57e4"}, - {file = "numpy-1.19.4-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:8ece138c3a16db8c1ad38f52eb32be6086cc72f403150a79336eb2045723a1ad"}, - {file = "numpy-1.19.4-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:64324f64f90a9e4ef732be0928be853eee378fd6a01be21a0a8469c4f2682c83"}, - {file = "numpy-1.19.4-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:ad6f2ff5b1989a4899bf89800a671d71b1612e5ff40866d1f4d8bcf48d4e5764"}, - {file = "numpy-1.19.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:d6c7bb82883680e168b55b49c70af29b84b84abb161cbac2800e8fcb6f2109b6"}, - {file = "numpy-1.19.4-cp36-cp36m-win32.whl", hash 
= "sha256:13d166f77d6dc02c0a73c1101dd87fdf01339febec1030bd810dcd53fff3b0f1"}, - {file = "numpy-1.19.4-cp36-cp36m-win_amd64.whl", hash = "sha256:448ebb1b3bf64c0267d6b09a7cba26b5ae61b6d2dbabff7c91b660c7eccf2bdb"}, - {file = "numpy-1.19.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:27d3f3b9e3406579a8af3a9f262f5339005dd25e0ecf3cf1559ff8a49ed5cbf2"}, - {file = "numpy-1.19.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:16c1b388cc31a9baa06d91a19366fb99ddbe1c7b205293ed072211ee5bac1ed2"}, - {file = "numpy-1.19.4-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e5b6ed0f0b42317050c88022349d994fe72bfe35f5908617512cd8c8ef9da2a9"}, - {file = "numpy-1.19.4-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:18bed2bcb39e3f758296584337966e68d2d5ba6aab7e038688ad53c8f889f757"}, - {file = "numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:fe45becb4c2f72a0907c1d0246ea6449fe7a9e2293bb0e11c4e9a32bb0930a15"}, - {file = "numpy-1.19.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:6d7593a705d662be5bfe24111af14763016765f43cb6923ed86223f965f52387"}, - {file = "numpy-1.19.4-cp37-cp37m-win32.whl", hash = "sha256:6ae6c680f3ebf1cf7ad1d7748868b39d9f900836df774c453c11c5440bc15b36"}, - {file = "numpy-1.19.4-cp37-cp37m-win_amd64.whl", hash = "sha256:9eeb7d1d04b117ac0d38719915ae169aa6b61fca227b0b7d198d43728f0c879c"}, - {file = "numpy-1.19.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cb1017eec5257e9ac6209ac172058c430e834d5d2bc21961dceeb79d111e5909"}, - {file = "numpy-1.19.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:edb01671b3caae1ca00881686003d16c2209e07b7ef8b7639f1867852b948f7c"}, - {file = "numpy-1.19.4-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:f29454410db6ef8126c83bd3c968d143304633d45dc57b51252afbd79d700893"}, - {file = "numpy-1.19.4-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:ec149b90019852266fec2341ce1db513b843e496d5a8e8cdb5ced1923a92faab"}, - {file = "numpy-1.19.4-cp38-cp38-manylinux2010_x86_64.whl", hash = 
"sha256:1aeef46a13e51931c0b1cf8ae1168b4a55ecd282e6688fdb0a948cc5a1d5afb9"}, - {file = "numpy-1.19.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:08308c38e44cc926bdfce99498b21eec1f848d24c302519e64203a8da99a97db"}, - {file = "numpy-1.19.4-cp38-cp38-win32.whl", hash = "sha256:5734bdc0342aba9dfc6f04920988140fb41234db42381cf7ccba64169f9fe7ac"}, - {file = "numpy-1.19.4-cp38-cp38-win_amd64.whl", hash = "sha256:09c12096d843b90eafd01ea1b3307e78ddd47a55855ad402b157b6c4862197ce"}, - {file = "numpy-1.19.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e452dc66e08a4ce642a961f134814258a082832c78c90351b75c41ad16f79f63"}, - {file = "numpy-1.19.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:a5d897c14513590a85774180be713f692df6fa8ecf6483e561a6d47309566f37"}, - {file = "numpy-1.19.4-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:a09f98011236a419ee3f49cedc9ef27d7a1651df07810ae430a6b06576e0b414"}, - {file = "numpy-1.19.4-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:50e86c076611212ca62e5a59f518edafe0c0730f7d9195fec718da1a5c2bb1fc"}, - {file = "numpy-1.19.4-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:f0d3929fe88ee1c155129ecd82f981b8856c5d97bcb0d5f23e9b4242e79d1de3"}, - {file = "numpy-1.19.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c42c4b73121caf0ed6cd795512c9c09c52a7287b04d105d112068c1736d7c753"}, - {file = "numpy-1.19.4-cp39-cp39-win32.whl", hash = "sha256:8cac8790a6b1ddf88640a9267ee67b1aee7a57dfa2d2dd33999d080bc8ee3a0f"}, - {file = "numpy-1.19.4-cp39-cp39-win_amd64.whl", hash = "sha256:4377e10b874e653fe96985c05feed2225c912e328c8a26541f7fc600fb9c637b"}, - {file = "numpy-1.19.4-pp36-pypy36_pp73-manylinux2010_x86_64.whl", hash = "sha256:2a2740aa9733d2e5b2dfb33639d98a64c3b0f24765fed86b0fd2aec07f6a0a08"}, - {file = "numpy-1.19.4.zip", hash = "sha256:141ec3a3300ab89c7f2b0775289954d193cc8edb621ea05f99db9cb181530512"}, -] pyyaml = [ {file = "PyYAML-5.3.1-cp27-cp27m-win32.whl", hash = 
"sha256:74809a57b329d6cc0fdccee6318f44b9b8649961fa73144a98735b0aaf029f1f"}, {file = "PyYAML-5.3.1-cp27-cp27m-win_amd64.whl", hash = "sha256:240097ff019d7c70a4922b6869d8a86407758333f02203e0fc6ff79c5dcede76"}, @@ -273,8 +240,6 @@ pyyaml = [ {file = "PyYAML-5.3.1-cp37-cp37m-win_amd64.whl", hash = "sha256:73f099454b799e05e5ab51423c7bcf361c58d3206fa7b0d555426b1f4d9a3eaf"}, {file = "PyYAML-5.3.1-cp38-cp38-win32.whl", hash = "sha256:06a0d7ba600ce0b2d2fe2e78453a470b5a6e000a985dd4a4e54e436cc36b0e97"}, {file = "PyYAML-5.3.1-cp38-cp38-win_amd64.whl", hash = "sha256:95f71d2af0ff4227885f7a6605c37fd53d3a106fcab511b8860ecca9fcf400ee"}, - {file = "PyYAML-5.3.1-cp39-cp39-win32.whl", hash = "sha256:ad9c67312c84def58f3c04504727ca879cb0013b2517c85a9a253f0cb6380c0a"}, - {file = "PyYAML-5.3.1-cp39-cp39-win_amd64.whl", hash = "sha256:6034f55dab5fea9e53f436aa68fa3ace2634918e8b5994d82f3621c04ff5ed2e"}, {file = "PyYAML-5.3.1.tar.gz", hash = "sha256:b8eac752c5e14d3eca0e6dd9199cd627518cb5ec06add0de9d32baeee6fe645d"}, ] requests = [ @@ -285,6 +250,9 @@ selenium = [ {file = "selenium-3.141.0-py2.py3-none-any.whl", hash = "sha256:2d7131d7bc5a5b99a2d9b04aaf2612c411b03b8ca1b1ee8d3de5845a9be2cb3c"}, {file = "selenium-3.141.0.tar.gz", hash = "sha256:deaf32b60ad91a4611b98d8002757f29e6f2c2d5fcaf202e1c9ad06d6772300d"}, ] +sgmllib3k = [ + {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, +] slackweb = [ {file = "slackweb-1.0.5.tar.gz", hash = "sha256:bb64f1f8fea99fc04ce8fb300a09aab8fe51d86e25cedafb1eaca07f7b828c36"}, ] @@ -296,7 +264,3 @@ urllib3 = [ {file = "urllib3-1.26.2-py2.py3-none-any.whl", hash = "sha256:d8ff90d979214d7b4f8ce956e80f4028fc6860e4431f731ea4a8c08f23f99473"}, {file = "urllib3-1.26.2.tar.gz", hash = "sha256:19188f96923873c92ccb987120ec4acaa12f0461fa9ce5d3d0772bc965a39e08"}, ] -wheel = [ - {file = "wheel-0.36.2-py2.py3-none-any.whl", hash = "sha256:78b5b185f0e5763c26ca1e324373aadd49182ca90e825f7853f4b2509215dc0e"}, - 
{file = "wheel-0.36.2.tar.gz", hash = "sha256:e11eefd162658ea59a60a0f6c7d493a7190ea4b9a85e335b33489d9f17e0245e"}, -] diff --git a/pyproject.toml b/pyproject.toml index 93dcb654..5305381d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,14 +8,12 @@ authors = ["hppRC "] python = "^3.8" wheel = "^0.36.2" slackweb = "^1.0.5" -requests = "^2.25.1" lxml = "^4.6.2" -numpy = "^1.19.4" -fastprogress = "^1.0.0" pyyaml = "^5.3.1" beautifulsoup4 = "^4.9.3" selenium = "^3.141.0" chromedriver-binary = "87.*" +arxiv = "^0.5.3" [tool.poetry.dev-dependencies] diff --git a/requirements.txt b/requirements.txt index a8dc28d6..03daa34c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,6 @@ numpy==1.19.4 fastprogress==1.0.0 pyyaml==5.3.1 beautifulsoup4==4.9.0 +chromedriver-binary==87.0.4280.88.0 +selenium==3.141.0 +arxiv==0.5.3 diff --git a/src/.ipynb_checkpoints/arxiv_notification-checkpoint.py b/src/.ipynb_checkpoints/arxiv_notification-checkpoint.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/carrier-owl.py b/src/carrier-owl.py deleted file mode 100644 index 2df3a3b3..00000000 --- a/src/carrier-owl.py +++ /dev/null @@ -1,188 +0,0 @@ -import chromedriver_binary # これは必ず入れる -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -import os -import time -import yaml -import datetime -import numpy as np -import textwrap -from bs4 import BeautifulSoup -import requests -from fastprogress import progress_bar -import slackweb -import warnings -import urllib.parse - -# setting -warnings.filterwarnings('ignore') - - -def get_articles_info(subject): - weekday_dict = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', - 4: 'Fri', 5: 'Sat', 6: 'Sun'} - url = f'https://arxiv.org/list/{subject}/pastweek?show=100000' - response = requests.get(url) - html = response.text - year = datetime.date.today().year - - # いつの論文データを取得するか - bs = BeautifulSoup(html) - h3 = bs.find_all('h3') - wd = weekday_dict[datetime.datetime.today().weekday()] - day 
= datetime.datetime.today().day - today = f'{wd}, {day}' - - # 今日、新しい論文が出てるかどうか(土日とか休みみたい) - if today in h3[0].text: - idx = 2 - else: - idx = 1 - articles_html = html.split(f'{year}')[idx] # <--------- 要注意 - - # 論文それぞれのurlを取得 - bs = BeautifulSoup(articles_html) - id_list = bs.find_all(class_='list-identifier') - return id_list - - -def serch_keywords(id_list, keywords_dict): - urls = [] - titles = [] - abstracts = [] - words = [] - scores = [] - for id_ in progress_bar(id_list): - a = id_.find('a') - _url = a.get('href') - url = 'https://arxiv.org'+_url - - response = requests.get(url) - html = response.text - - bs = BeautifulSoup(html) - title = bs.find('meta', attrs={'property': 'og:title'})['content'] - abstract = bs.find( - 'meta', - attrs={'property': 'og:description'})['content'] - - sum_score = 0 - hit_kwd_list = [] - - for word in keywords_dict.keys(): - score = keywords_dict[word] - if word.lower() in abstract.lower(): # 全部小文字にすれば、大文字少文字区別しなくていい - sum_score += score - hit_kwd_list.append(word) - if sum_score != 0: - title_trans = get_translated_text('ja', 'en', title) - abstract = abstract.replace('\n', '') - abstract_trans = get_translated_text('ja', 'en', abstract) - abstract_trans = textwrap.wrap(abstract_trans, 40) # 40行で改行 - abstract_trans = '\n'.join(abstract_trans) - - urls.append(url) - titles.append(title_trans) - abstracts.append(abstract_trans) - words.append(hit_kwd_list) - scores.append(sum_score) - - results = [urls, titles, abstracts, words, scores] - - return results - - -def send2slack(results, slack): - urls = results[0] - titles = results[1] - abstracts = results[2] - words = results[3] - scores = results[4] - - # rank - idxs_sort = np.argsort(scores) - idxs_sort = idxs_sort[::-1] - - # 通知 - star = '*'*120 - today = datetime.date.today() - text = f'{star}\n \t \t {today}\n{star}' - slack.notify(text=text) - for i in idxs_sort: - url = urls[i] - title = titles[i] - abstract = abstracts[i] - word = words[i] - score = scores[i] - - 
text_slack = f''' - \n score: `{score}`\n hit keywords: `{word}`\n url: {url}\n title: {title}\n abstract: \n \t {abstract}\n{star} - ''' - slack.notify(text=text_slack) - - -def get_translated_text(from_lang, to_lang, from_text): - ''' - https://qiita.com/fujino-fpu/items/e94d4ff9e7a5784b2987 - ''' - - sleep_time = 1 - - # urlencode - from_text = urllib.parse.quote(from_text) - - # url作成 - url = 'https://www.deepl.com/translator#' + from_lang + '/' + to_lang + '/' + from_text - - # ヘッドレスモードでブラウザを起動 - options = Options() - options.add_argument('--headless') - - # ブラウザーを起動 - driver = webdriver.Chrome(options=options) - driver.get(url) - driver.implicitly_wait(10) # 見つからないときは、10秒まで待つ - - for i in range(30): - # 指定時間待つ - time.sleep(sleep_time) - html = driver.page_source - to_text = get_text_from_page_source(html) - - try_count = i + 1 - if to_text: - wait_time = sleep_time * try_count - # アクセス終了 - break - - # ブラウザ停止 - driver.quit() - return to_text - - -def get_text_from_page_source(html): - soup = BeautifulSoup(html, features='lxml') - target_elem = soup.find(class_="lmt__translations_as_text__text_btn") - text = target_elem.text - return text - - -def get_config(): - file_abs_path = os.path.abspath(__file__) - file_dir = os.path.dirname(file_abs_path) - config_path = f'{file_dir}/../config.yaml' - with open(config_path, 'r') as yml: - config = yaml.load(yml) - return config - - -def main(): - config = get_config() - slack = slackweb.Slack(url=os.getenv("SLACK_ID")) - id_list = get_articles_info(config['subject']) - results = serch_keywords(id_list, config['keywords']) - send2slack(results, slack) - - -if __name__ == "__main__": - main() diff --git a/src/carrier_owl.py b/src/carrier_owl.py new file mode 100644 index 00000000..1f1552b8 --- /dev/null +++ b/src/carrier_owl.py @@ -0,0 +1,188 @@ +import chromedriver_binary # これは必ず入れる +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +import os +import time +import yaml +import 
import datetime
import slackweb
import argparse
import sys
import textwrap
from bs4 import BeautifulSoup
import warnings
import urllib.parse
from dataclasses import dataclass
from typing import Tuple
import arxiv
import requests
# setting
warnings.filterwarnings('ignore')


@dataclass
class Result:
    """One article selected for notification."""
    url: str        # value taken from the article's 'arxiv_url' entry
    title: str      # translated title
    abstract: str   # translated abstract, wrapped to 40-character lines
    words: list     # configured keywords that were found in the abstract
    score: float = 0.0  # sum of the weights of the matched keywords


def calc_score(abst: str, keywords: dict) -> Tuple[float, list]:
    """Score an abstract against the configured keywords.

    Matching is a case-insensitive substring test.

    Args:
        abst: Abstract text to search.
        keywords: Mapping of keyword -> weight.

    Returns:
        ``(sum_score, hit_kwd_list)``: the summed weight of every keyword
        found in ``abst`` and the list of those keywords, in the iteration
        order of ``keywords``.
    """
    sum_score = 0.0
    hit_kwd_list = []
    # Lower-case the abstract once instead of once per keyword.
    abst_lower = abst.lower()

    for word, score in keywords.items():
        if word.lower() in abst_lower:
            sum_score += score
            hit_kwd_list.append(word)
    return sum_score, hit_kwd_list


def search_keyword(
        articles: list, keywords: dict, score_threshold: float
        ) -> list:
    """Pick the articles whose keyword score passes the threshold.

    Args:
        articles: Items indexable by ``'arxiv_url'``, ``'title'`` and
            ``'summary'``.
        keywords: Mapping of keyword -> weight (see ``calc_score``).
        score_threshold: Minimum score an article needs to be reported.

    Returns:
        A list of ``Result`` objects with translated title and abstract.
        Articles with a score of exactly 0 are never reported, whatever
        the threshold is.
    """
    results = []

    for article in articles:
        url = article['arxiv_url']
        title = article['title']
        abstract = article['summary']
        score, hit_keywords = calc_score(abstract, keywords)
        if (score != 0) and (score >= score_threshold):
            title_trans = get_translated_text('ja', 'en', title)
            abstract = abstract.replace('\n', '')
            abstract_trans = get_translated_text('ja', 'en', abstract)
            # Break the translated abstract into lines of at most 40 chars.
            abstract_trans = textwrap.wrap(abstract_trans, 40)
            abstract_trans = '\n'.join(abstract_trans)
            result = Result(
                    url=url, title=title_trans, abstract=abstract_trans,
                    score=score, words=hit_keywords)
            results.append(result)
    return results


def send2app(text: str, slack_id: str, line_token: str) -> None:
    """Send ``text`` to every configured notification target.

    Args:
        text: Message body.
        slack_id: Slack incoming-webhook URL; skipped when None or empty.
        line_token: LINE Notify access token; skipped when None or empty.
    """
    # slack
    # Truthiness (not ``is not None``) so that an empty-string secret is
    # treated as "not configured" instead of being used as a URL.
    if slack_id:
        slack = slackweb.Slack(url=slack_id)
        slack.notify(text=text)

    # line
    if line_token:
        line_notify_api = 'https://notify-api.line.me/api/notify'
        headers = {'Authorization': f'Bearer {line_token}'}
        data = {'message': f'message: {text}'}
        # A timeout keeps a stalled connection from hanging the whole run.
        response = requests.post(
            line_notify_api, headers=headers, data=data, timeout=30)
        if not response.ok:
            # Report the failure but keep going so that the remaining
            # notifications are still attempted.
            print(f'LINE notify failed: HTTP {response.status_code}',
                  file=sys.stderr)


def notify(results: list, slack_id: str, line_token: str) -> None:
    """Send a header message followed by one message per result.

    Results are sent in descending score order.

    Args:
        results: ``Result`` objects to report.
        slack_id: Forwarded to ``send2app``.
        line_token: Forwarded to ``send2app``.
    """
    # Header: date and number of articles, framed by separator lines.
    star = '*'*80
    today = datetime.date.today()
    n_articles = len(results)
    text = f'{star}\n \t \t {today}\tnum of articles = {n_articles}\n{star}'
    send2app(text, slack_id, line_token)
    # descending
    for result in sorted(results, reverse=True, key=lambda x: x.score):
        url = result.url
        title = result.title
        abstract = result.abstract
        word = result.words
        score = result.score

        text = f'\n score: `{score}`'\
               f'\n hit keywords: `{word}`'\
               f'\n url: {url}'\
               f'\n title: {title}'\
               f'\n abstract:'\
               f'\n \t {abstract}'\
               f'\n {star}'

        send2app(text, slack_id, line_token)


def get_translated_text(from_lang: str, to_lang: str, from_text: str) -> str:
    '''
    Translate ``from_text`` by driving the DeepL web page with headless
    Chrome and polling the page for the result (up to 30 one-second tries).

    https://qiita.com/fujino-fpu/items/e94d4ff9e7a5784b2987

    Args:
        from_lang: First language code placed in the URL fragment.
        to_lang: Second language code placed in the URL fragment.
        from_text: Text to translate.

    Returns:
        The translated text, or '' if no result appeared in time.
    '''
    # NOTE(review): callers pass ('ja', 'en') for English input; whether
    # DeepL reads the fragment as source/target or swaps them automatically
    # cannot be told from this file -- confirm.

    sleep_time = 1

    # urlencode
    from_text = urllib.parse.quote(from_text)

    # Build the URL.
    url = 'https://www.deepl.com/translator#' \
        + from_lang + '/' + to_lang + '/' + from_text

    # Run the browser in headless mode.
    options = Options()
    options.add_argument('--headless')

    # Start the browser.
    driver = webdriver.Chrome(options=options)
    to_text = ''
    try:
        driver.get(url)
        driver.implicitly_wait(10)  # wait up to 10 s for missing elements

        for _ in range(30):
            # Give the page time to render the translation.
            time.sleep(sleep_time)
            html = driver.page_source
            to_text = get_text_from_page_source(html)

            if to_text:
                break
    finally:
        # Always stop the browser, even when loading or parsing fails,
        # so no Chrome process is left behind.
        driver.quit()
    return to_text


def get_text_from_page_source(html: str) -> str:
    """Extract the translated text from the DeepL page HTML.

    Args:
        html: Page source.

    Returns:
        The text of the first element with class
        ``lmt__translations_as_text__text_btn``, or '' when the page does
        not contain that element (e.g. it has not been rendered yet).
    """
    soup = BeautifulSoup(html, features='lxml')
    target_elem = soup.find(class_="lmt__translations_as_text__text_btn")
    if target_elem is None:
        # find() returns None when nothing matches; report "no text yet"
        # so the polling caller can retry instead of crashing.
        return ''
    return target_elem.text


def get_config() -> dict:
    """Load ``config.yaml`` from the parent directory of this file.

    Returns:
        The parsed YAML document.
    """
    file_abs_path = os.path.abspath(__file__)
    file_dir = os.path.dirname(file_abs_path)
    config_path = f'{file_dir}/../config.yaml'
    # Explicit encoding: the config carries non-ASCII comments.
    with open(config_path, 'r', encoding='utf-8') as yml:
        # safe_load: plain data only, and no Loader argument is required.
        config = yaml.safe_load(yml)
    return config


def main():
    """Query yesterday's arXiv submissions, score them and notify."""
    # For debugging: targets may be given on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('--slack_id', default=None)
    parser.add_argument('--line_token', default=None)
    args = parser.parse_args()

    config = get_config()
    subject = config['subject']
    keywords = config['keywords']
    # Configs written before this option existed have no threshold key.
    score_threshold = float(config.get('score_threshold', 0))

    # NOTE(review): "yesterday" is computed from the local clock of the
    # machine running this; confirm it matches the time zone arXiv uses
    # for submittedDate.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    yesterday_str = yesterday.strftime('%Y%m%d')
    # datetime format YYYYMMDDHHMMSS
    arxiv_query = f'({subject}) AND ' \
                  f'submittedDate:' \
                  f'[{yesterday_str}000000 TO {yesterday_str}235959]'
    articles = arxiv.query(query=arxiv_query,
                           max_results=1000,
                           sort_by='submittedDate',
                           iterative=False)
    results = search_keyword(articles, keywords, score_threshold)

    # Environment variables win; an unset or empty one falls back to the
    # command-line value.
    slack_id = os.getenv("SLACK_ID") or args.slack_id
    line_token = os.getenv("LINE_TOKEN") or args.line_token
    notify(results, slack_id, line_token)


if __name__ == "__main__":
    main()