From 8d4d16176d08a131dc1868f21dc1e102e35fa7e8 Mon Sep 17 00:00:00 2001
From: stopmin
Date: Mon, 15 Jul 2024 23:05:30 +0900
Subject: [PATCH 1/5] Comment out lightfm - additionally fix missing plotly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
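
This removes lightfm from the dependencies and comments out the
LightFM-based RecommendService instead of deleting it; plotly, which was
missing from the Pipfile, is added as well.

For reference, a minimal sketch of the flow the service implemented, on
toy data (assumes the lightfm 1.17 API that is being removed from the
lockfile below; the IDs and feature names here are illustrative, not the
project's real values):

    import numpy as np
    from lightfm import LightFM
    from lightfm.data import Dataset

    users = [1, 2]        # the service used classification_id values
    items = [10, 11, 12]  # ... and article_id values
    tags = ["ECONOMY_AND_BUSINESS", "POLITICS_AND_SOCIETY"]

    dataset = Dataset()
    dataset.fit(users=users, items=items, item_features=tags)

    # (item, [feature names]) pairs, as in build_item_features
    item_features = dataset.build_item_features(
        [(10, ["ECONOMY_AND_BUSINESS"]),
         (11, ["POLITICS_AND_SOCIETY"]),
         (12, ["ECONOMY_AND_BUSINESS"])]
    )
    # (user, item, weight) triples; the service weighted interactions
    # by reading duration, hence sample_weight in fit() below
    interactions, weights = dataset.build_interactions(
        [(1, 10, 5.0), (1, 11, 1.0), (2, 12, 3.0)]
    )

    model = LightFM(no_components=30, loss="warp", random_state=1616)
    model.fit(interactions, item_features=item_features,
              sample_weight=weights, epochs=30, num_threads=4)

    # rank every item for the user at internal index 0, best first
    scores = model.predict(0, np.arange(interactions.shape[1]),
                           item_features=item_features)
    print(np.argsort(-scores))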
"sha256:3c672be7ba6f95d555ea207d4486c171d06657eb34b3ce25eb043bfe7b6b5b76", @@ -1986,64 +1980,6 @@ "markers": "python_version >= '3.7'", "version": "==1.2.2" }, - "scikit-learn": { - "hashes": [ - "sha256:0828673c5b520e879f2af6a9e99eee0eefea69a2188be1ca68a6121b809055c1", - "sha256:0ea5d40c0e3951df445721927448755d3fe1d80833b0b7308ebff5d2a45e6414", - "sha256:10e49170691514a94bb2e03787aa921b82dbc507a4ea1f20fd95557862c98dc1", - "sha256:154297ee43c0b83af12464adeab378dee2d0a700ccd03979e2b821e7dd7cc1c2", - "sha256:161808750c267b77b4a9603cf9c93579c7a74ba8486b1336034c2f1579546d21", - "sha256:1bd8d3a19d4bd6dc5a7d4f358c8c3a60934dc058f363c34c0ac1e9e12a31421d", - "sha256:1ff4ba34c2abff5ec59c803ed1d97d61b036f659a17f55be102679e88f926fac", - "sha256:508907e5f81390e16d754e8815f7497e52139162fd69c4fdbd2dfa5d6cc88915", - "sha256:5944ce1faada31c55fb2ba20a5346b88e36811aab504ccafb9f0339e9f780395", - "sha256:5f57428de0c900a98389c4a433d4a3cf89de979b3aa24d1c1d251802aa15e44d", - "sha256:689b6f74b2c880276e365fe84fe4f1befd6a774f016339c65655eaff12e10cbf", - "sha256:781586c414f8cc58e71da4f3d7af311e0505a683e112f2f62919e3019abd3745", - "sha256:7b073a27797a283187a4ef4ee149959defc350b46cbf63a84d8514fe16b69855", - "sha256:88e0672c7ac21eb149d409c74cc29f1d611d5158175846e7a9c2427bd12b3956", - "sha256:909144d50f367a513cee6090873ae582dba019cb3fca063b38054fa42704c3a4", - "sha256:97625f217c5c0c5d0505fa2af28ae424bd37949bb2f16ace3ff5f2f81fb4498b", - "sha256:9a07f90846313a7639af6a019d849ff72baadfa4c74c778821ae0fad07b7275b", - "sha256:b59e3e62d2be870e5c74af4e793293753565c7383ae82943b83383fdcf5cc5c1", - "sha256:b5e865e9bd59396220de49cb4a57b17016256637c61b4c5cc81aaf16bc123bbe", - "sha256:da3f404e9e284d2b0a157e1b56b6566a34eb2798205cba35a211df3296ab7a74", - "sha256:f5b213bc29cc30a89a3130393b0e39c847a15d769d6e59539cd86b75d276b1a7" - ], - "markers": "python_version >= '3.9'", - "version": "==1.5.1" - }, - "scipy": { - "hashes": [ - "sha256:076c27284c768b84a45dcf2e914d4000aac537da74236a0d45d82c6fa4b7b3c0", - "sha256:07e179dc0205a50721022344fb85074f772eadbda1e1b3eecdc483f8033709b7", - "sha256:176c6f0d0470a32f1b2efaf40c3d37a24876cebf447498a4cefb947a79c21e9d", - "sha256:42470ea0195336df319741e230626b6225a740fd9dce9642ca13e98f667047c0", - "sha256:4c4161597c75043f7154238ef419c29a64ac4a7c889d588ea77690ac4d0d9b20", - "sha256:5b083c8940028bb7e0b4172acafda6df762da1927b9091f9611b0bcd8676f2bc", - "sha256:64b2ff514a98cf2bb734a9f90d32dc89dc6ad4a4a36a312cd0d6327170339eb0", - "sha256:65df4da3c12a2bb9ad52b86b4dcf46813e869afb006e58be0f516bc370165159", - "sha256:687af0a35462402dd851726295c1a5ae5f987bd6e9026f52e9505994e2f84ef6", - "sha256:6a9c9a9b226d9a21e0a208bdb024c3982932e43811b62d202aaf1bb59af264b1", - "sha256:6d056a8709ccda6cf36cdd2eac597d13bc03dba38360f418560a93050c76a16e", - "sha256:7d3da42fbbbb860211a811782504f38ae7aaec9de8764a9bef6b262de7a2b50f", - "sha256:7e911933d54ead4d557c02402710c2396529540b81dd554fc1ba270eb7308484", - "sha256:94c164a9e2498e68308e6e148646e486d979f7fcdb8b4cf34b5441894bdb9caf", - "sha256:9e3154691b9f7ed73778d746da2df67a19d046a6c8087c8b385bc4cdb2cfca74", - "sha256:9eee2989868e274aae26125345584254d97c56194c072ed96cb433f32f692ed8", - "sha256:a01cc03bcdc777c9da3cfdcc74b5a75caffb48a6c39c8450a9a05f82c4250a14", - "sha256:a7d46c3e0aea5c064e734c3eac5cf9eb1f8c4ceee756262f2c7327c4c2691c86", - "sha256:ad36af9626d27a4326c8e884917b7ec321d8a1841cd6dacc67d2a9e90c2f0359", - "sha256:b5923f48cb840380f9854339176ef21763118a7300a88203ccd0bdd26e58527b", - "sha256:bbc0471b5f22c11c389075d091d3885693fd3f5e9a54ce051b46308bc787e5d4", - 
"sha256:bff2438ea1330e06e53c424893ec0072640dac00f29c6a43a575cbae4c99b2b9", - "sha256:c40003d880f39c11c1edbae8144e3813904b10514cd3d3d00c277ae996488cdb", - "sha256:d91db2c41dd6c20646af280355d41dfa1ec7eead235642178bd57635a3f82209", - "sha256:f0a50da861a7ec4573b7c716b2ebdcdf142b66b756a0d392c236ae568b3a93fb" - ], - "markers": "python_version >= '3.10'", - "version": "==1.14.0" - }, "setuptools": { "hashes": [ "sha256:f171bab1dfbc86b132997f26a119f6056a57950d058587841a0082e8830f9dc5", @@ -2170,14 +2106,6 @@ "markers": "python_version >= '3.8'", "version": "==8.3.0" }, - "threadpoolctl": { - "hashes": [ - "sha256:082433502dd922bf738de0d8bcc4fdcbf0979ff44c42bd40f5af8a282f6fa107", - "sha256:56c1e26c150397e58c4926da8eeee87533b1e32bef131bd4bf6a2f45f3185467" - ], - "markers": "python_version >= '3.8'", - "version": "==3.5.0" - }, "tiktoken": { "hashes": [ "sha256:03c6c40ff1db0f48a7b4d2dafeae73a5607aacb472fa11f125e7baf9dce73704", @@ -2366,90 +2294,6 @@ "markers": "python_version >= '2'", "version": "==2024.1" }, - "ujson": { - "hashes": [ - "sha256:0de4971a89a762398006e844ae394bd46991f7c385d7a6a3b93ba229e6dac17e", - "sha256:129e39af3a6d85b9c26d5577169c21d53821d8cf68e079060602e861c6e5da1b", - "sha256:22cffecf73391e8abd65ef5f4e4dd523162a3399d5e84faa6aebbf9583df86d6", - "sha256:232cc85f8ee3c454c115455195a205074a56ff42608fd6b942aa4c378ac14dd7", - "sha256:2544912a71da4ff8c4f7ab5606f947d7299971bdd25a45e008e467ca638d13c9", - "sha256:2601aa9ecdbee1118a1c2065323bda35e2c5a2cf0797ef4522d485f9d3ef65bd", - "sha256:26b0e2d2366543c1bb4fbd457446f00b0187a2bddf93148ac2da07a53fe51569", - "sha256:2987713a490ceb27edff77fb184ed09acdc565db700ee852823c3dc3cffe455f", - "sha256:29b443c4c0a113bcbb792c88bea67b675c7ca3ca80c3474784e08bba01c18d51", - "sha256:2a890b706b64e0065f02577bf6d8ca3b66c11a5e81fb75d757233a38c07a1f20", - "sha256:2aff2985cef314f21d0fecc56027505804bc78802c0121343874741650a4d3d1", - "sha256:348898dd702fc1c4f1051bc3aacbf894caa0927fe2c53e68679c073375f732cf", - "sha256:38665e7d8290188b1e0d57d584eb8110951a9591363316dd41cf8686ab1d0abc", - "sha256:38d5d36b4aedfe81dfe251f76c0467399d575d1395a1755de391e58985ab1c2e", - "sha256:3ff201d62b1b177a46f113bb43ad300b424b7847f9c5d38b1b4ad8f75d4a282a", - "sha256:4573fd1695932d4f619928fd09d5d03d917274381649ade4328091ceca175539", - "sha256:4734ee0745d5928d0ba3a213647f1c4a74a2a28edc6d27b2d6d5bd9fa4319e27", - "sha256:4c4fc16f11ac1612f05b6f5781b384716719547e142cfd67b65d035bd85af165", - "sha256:502bf475781e8167f0f9d0e41cd32879d120a524b22358e7f205294224c71126", - "sha256:57aaf98b92d72fc70886b5a0e1a1ca52c2320377360341715dd3933a18e827b1", - "sha256:59e02cd37bc7c44d587a0ba45347cc815fb7a5fe48de16bf05caa5f7d0d2e816", - "sha256:5b6fee72fa77dc172a28f21693f64d93166534c263adb3f96c413ccc85ef6e64", - "sha256:5b91b5d0d9d283e085e821651184a647699430705b15bf274c7896f23fe9c9d8", - "sha256:604a046d966457b6cdcacc5aa2ec5314f0e8c42bae52842c1e6fa02ea4bda42e", - "sha256:618efd84dc1acbd6bff8eaa736bb6c074bfa8b8a98f55b61c38d4ca2c1f7f287", - "sha256:61d0af13a9af01d9f26d2331ce49bb5ac1fb9c814964018ac8df605b5422dcb3", - "sha256:61e1591ed9376e5eddda202ec229eddc56c612b61ac6ad07f96b91460bb6c2fb", - "sha256:621e34b4632c740ecb491efc7f1fcb4f74b48ddb55e65221995e74e2d00bbff0", - "sha256:6627029ae4f52d0e1a2451768c2c37c0c814ffc04f796eb36244cf16b8e57043", - "sha256:67079b1f9fb29ed9a2914acf4ef6c02844b3153913eb735d4bf287ee1db6e557", - "sha256:6dea1c8b4fc921bf78a8ff00bbd2bfe166345f5536c510671bccececb187c80e", - "sha256:6e32abdce572e3a8c3d02c886c704a38a1b015a1fb858004e03d20ca7cecbb21", - 
"sha256:7223f41e5bf1f919cd8d073e35b229295aa8e0f7b5de07ed1c8fddac63a6bc5d", - "sha256:73814cd1b9db6fc3270e9d8fe3b19f9f89e78ee9d71e8bd6c9a626aeaeaf16bd", - "sha256:7490655a2272a2d0b072ef16b0b58ee462f4973a8f6bbe64917ce5e0a256f9c0", - "sha256:7663960f08cd5a2bb152f5ee3992e1af7690a64c0e26d31ba7b3ff5b2ee66337", - "sha256:78778a3aa7aafb11e7ddca4e29f46bc5139131037ad628cc10936764282d6753", - "sha256:7c10f4654e5326ec14a46bcdeb2b685d4ada6911050aa8baaf3501e57024b804", - "sha256:7ec0ca8c415e81aa4123501fee7f761abf4b7f386aad348501a26940beb1860f", - "sha256:924f7318c31874d6bb44d9ee1900167ca32aa9b69389b98ecbde34c1698a250f", - "sha256:94a87f6e151c5f483d7d54ceef83b45d3a9cca7a9cb453dbdbb3f5a6f64033f5", - "sha256:98ba15d8cbc481ce55695beee9f063189dce91a4b08bc1d03e7f0152cd4bbdd5", - "sha256:a245d59f2ffe750446292b0094244df163c3dc96b3ce152a2c837a44e7cda9d1", - "sha256:a5b366812c90e69d0f379a53648be10a5db38f9d4ad212b60af00bd4048d0f00", - "sha256:a65b6af4d903103ee7b6f4f5b85f1bfd0c90ba4eeac6421aae436c9988aa64a2", - "sha256:a984a3131da7f07563057db1c3020b1350a3e27a8ec46ccbfbf21e5928a43050", - "sha256:a9d2edbf1556e4f56e50fab7d8ff993dbad7f54bac68eacdd27a8f55f433578e", - "sha256:ab13a2a9e0b2865a6c6db9271f4b46af1c7476bfd51af1f64585e919b7c07fd4", - "sha256:ac56eb983edce27e7f51d05bc8dd820586c6e6be1c5216a6809b0c668bb312b8", - "sha256:ad88ac75c432674d05b61184178635d44901eb749786c8eb08c102330e6e8996", - "sha256:b0111b27f2d5c820e7f2dbad7d48e3338c824e7ac4d2a12da3dc6061cc39c8e6", - "sha256:b3cd8f3c5d8c7738257f1018880444f7b7d9b66232c64649f562d7ba86ad4bc1", - "sha256:b9500e61fce0cfc86168b248104e954fead61f9be213087153d272e817ec7b4f", - "sha256:ba17799fcddaddf5c1f75a4ba3fd6441f6a4f1e9173f8a786b42450851bd74f1", - "sha256:ba43cc34cce49cf2d4bc76401a754a81202d8aa926d0e2b79f0ee258cb15d3a4", - "sha256:baed37ea46d756aca2955e99525cc02d9181de67f25515c468856c38d52b5f3b", - "sha256:beeaf1c48e32f07d8820c705ff8e645f8afa690cca1544adba4ebfa067efdc88", - "sha256:c18610b9ccd2874950faf474692deee4223a994251bc0a083c114671b64e6518", - "sha256:c66962ca7565605b355a9ed478292da628b8f18c0f2793021ca4425abf8b01e5", - "sha256:caf270c6dba1be7a41125cd1e4fc7ba384bf564650beef0df2dd21a00b7f5770", - "sha256:cc6139531f13148055d691e442e4bc6601f6dba1e6d521b1585d4788ab0bfad4", - "sha256:d2c75269f8205b2690db4572a4a36fe47cd1338e4368bc73a7a0e48789e2e35a", - "sha256:d47ebb01bd865fdea43da56254a3930a413f0c5590372a1241514abae8aa7c76", - "sha256:d4dc2fd6b3067c0782e7002ac3b38cf48608ee6366ff176bbd02cf969c9c20fe", - "sha256:d7d0e0ceeb8fe2468c70ec0c37b439dd554e2aa539a8a56365fd761edb418988", - "sha256:d8640fb4072d36b08e95a3a380ba65779d356b2fee8696afeb7794cf0902d0a1", - "sha256:dee5e97c2496874acbf1d3e37b521dd1f307349ed955e62d1d2f05382bc36dd5", - "sha256:dfef2814c6b3291c3c5f10065f745a1307d86019dbd7ea50e83504950136ed5b", - "sha256:e1402f0564a97d2a52310ae10a64d25bcef94f8dd643fcf5d310219d915484f7", - "sha256:e7ce306a42b6b93ca47ac4a3b96683ca554f6d35dd8adc5acfcd55096c8dfcb8", - "sha256:e82d4bb2138ab05e18f089a83b6564fee28048771eb63cdecf4b9b549de8a2cc", - "sha256:ecb24f0bdd899d368b715c9e6664166cf694d1e57be73f17759573a6986dd95a", - "sha256:f00ea7e00447918ee0eff2422c4add4c5752b1b60e88fcb3c067d4a21049a720", - "sha256:f3caf9cd64abfeb11a3b661329085c5e167abbe15256b3b68cb5d914ba7396f3", - "sha256:f44bd4b23a0e723bf8b10628288c2c7c335161d6840013d4d5de20e48551773b", - "sha256:f77b74475c462cb8b88680471193064d3e715c7c6074b1c8c412cb526466efe9", - "sha256:f8ccb77b3e40b151e20519c6ae6d89bfe3f4c14e8e210d910287f778368bb3d1", - "sha256:fbd8fd427f57a03cff3ad6574b5e299131585d9727c8c366da4624a9069ed746" - ], - 
"markers": "python_version >= '3.8'", - "version": "==5.10.0" - }, "uritemplate": { "hashes": [ "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0", diff --git a/app/recommend/recommend_service.py b/app/recommend/recommend_service.py index d0f9a9f..d2b391e 100644 --- a/app/recommend/recommend_service.py +++ b/app/recommend/recommend_service.py @@ -1,243 +1,243 @@ -# pylint: disable=missing-module-docstring, missing-module-docstring, attribute-defined-outside-init, unnecessary-comprehension, not-callable, consider-using-f-string, unused-variable - -import asyncio -import os -import warnings -from datetime import datetime - -import numpy as np -import pandas as pd -from fastapi import Depends -from sqlalchemy.ext.asyncio import AsyncSession - -from app.database.repository import model_to_dict -from app.database.session import get_db_session -from app.model.crawled_article import Articles -from app.repository.crawled_article_crud import CrawledArticleRepository -from app.service.article_manage_service import ArticleManageService -from app.repository.interaction_crud import InteractionRepository -from app.model.interaction import Interaction -from lightfm import LightFM -from lightfm.data import Dataset # pylint: disable=E0611 - -warnings.filterwarnings("ignore") - - -def articles_to_dataframe(articles: list[Articles]) -> pd.DataFrame: - # 객체 리스트를 딕셔너리 리스트로 변환 - articles_dict_list = [ - { - "article_id": article.id, - 'ECONOMY_AND_BUSINESS': 0, - 'POLITICS_AND_SOCIETY': 0, - 'SPORTS_AND_LEISURE': 0, - 'TECHNOLOGY_AND_CULTURE': 0 - # "created_at": article.created_at.strftime('%Y-%m-%d'), - } - for article in articles - ] - for i in range(len(articles_dict_list)): - articles_dict_list[i][articles[i].category] = 1 - - df = pd.DataFrame(articles_dict_list) - return df - -def interaction_to_dataframe(interactions : list[Interaction]) -> pd.DataFrame: - interaction_dict_list = [ - { - "classification_id": interaction.classification_id, - "article_id": interaction.article_id, - "duration_time": interaction.duration_time - } - for interaction in interactions - ] - df = pd.DataFrame(interaction_dict_list) - return df - -class ArticleDataInfo: - def __init__(self, article_id, category, created_at): - self.article_data = pd.DataFrame( - { - "article_id": article_id, - "경제 및 기업": [0], - "정치 및 사회": [0], - "기술 및 문화": [0], - "스포츠 및 여가": [0], - "오피니언 및 분석": [0], - # "created_at": [created_at], - } - ) - - self.article_data.iloc[0][category] = 1 - - -class InteractionDataInfo: - def __init__(self, user_id, article_id, duration_time): - self.interaction_data = pd.DataFrame( - { - "classification_id": [user_id], - "article_id": [article_id], - "duration_time": [duration_time], - } - ) - - -class RecommendService: - # pylint: disable=too-many-instance-attributes - - def __init__(self): - self.interaction_datas = None - self.num_classification = 5 - - def set_user_datas(self, user_data_path): - self.user_data_path = user_data_path - self.user_datas = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) + user_data_path) - - - async def initialize_data(self, session): - self.set_user_datas("/./user_classification.csv") - await self.set_article_datas(session) - await self.set_interaction_datas(session) - - async def set_article_datas(self, session): - # session = Depends(get_db_session) - articles = await ArticleManageService().get_all_articles(session=session) - self.article_datas = pd.get_dummies(articles_to_dataframe(articles)) - - async def set_interaction_datas(self, session): - # 
-        interactions = await InteractionRepository().get_all(session=session)
-        self.interaction_datas = interaction_to_dataframe(interactions)
-        print(self.interaction_datas.columns)
-
-    def make_dataset(self):
-        self.user_datas = pd.get_dummies(self.user_datas)
-        self.user_features_col = self.user_datas.drop(
-            columns=["classification_id"]
-        ).columns.values
-        self.user_feat = self.user_datas.drop(columns=["classification_id"]).to_dict(
-            orient="records"
-        )
-
-        self.item_features = self.article_datas
-        self.item_features_col = self.item_features.drop(
-            columns=["article_id"]
-        ).columns.values
-        self.item_feat = self.item_features.drop(
-            columns=["article_id"]
-        ).to_dict(orient="records")
-
-        self.dataset = Dataset()
-        self.dataset.fit(
-            users=[x for x in self.user_datas["classification_id"]],
-            items=[x for x in self.article_datas["article_id"]],
-            item_features=self.item_features_col,
-            user_features=self.user_features_col,
-        )
-
-        print(self.item_feat)
-        self.item_features = self.dataset.build_item_features(
-            (x, y) for x, y in zip(self.item_features["article_id"], self.item_feat)
-        )
-        self.user_features = self.dataset.build_user_features(
-            (x, y) for x, y in zip(self.user_datas["classification_id"], self.user_feat)
-        )
-
-        (self.interactions, self.weights) = self.dataset.build_interactions(
-            (x, y, z)
-            for x, y, z in zip(
-                self.interaction_datas["classification_id"],
-                self.interaction_datas["article_id"],
-                self.interaction_datas["duration_time"],
-            )
-        )
-
-        num_users, num_items = self.dataset.interactions_shape()
-        print("Num users: {}, num_items {}.".format(num_users, num_items))
-
-    def make_model(
-        self,
-        n_components: int = 30,
-        loss: str = "warp",
-        epoch: int = 30,
-        num_thread: int = 4,
-    ):
-        self.n_components = n_components
-        self.loss = loss
-        self.epoch = epoch
-        self.num_thread = num_thread
-        self.model = LightFM(
-            no_components=self.n_components, loss=self.loss, random_state=1616
-        )
-
-    def fit_model(self):
-        self.make_dataset()
-        self.make_model()
-        self.model.fit(
-            self.interactions,
-            user_features=self.user_features,
-            item_features=self.item_features,
-            epochs=self.epoch,
-            num_threads=self.num_thread,
-            sample_weight=self.weights,
-        )
-
-    def get_top_n_articles(self, user_id: int, article_num: int):
-        item_ids = np.arange(self.interactions.shape[1])  # array of item IDs to score
-
-        predictions = self.model.predict(user_id, item_ids)
-        top_items = self.article_datas.iloc[np.argsort(-predictions)[:article_num]]
-        return top_items
-
-    def similar_items(self, item_id, N=10):
-        item_bias, item_representations = self.model.get_item_representations(
-            features=self.item_features
-        )
-
-        scores = item_representations.dot(item_representations[item_id, :])
-        best = np.argpartition(scores, -N)[-N:]
-
-        return self.article_datas.iloc[best]
-
-    async def get_classification_for_article(self, article_id:id, session:AsyncSession):
-        scores = self.model.predict(np.arange(len(self.user_datas)), np.full(len(self.user_datas), article_id))
-        top_users = np.argsort(-scores)
-
-        score_for_classification = [0 for _ in range(self.num_classification)]
-        weight = 10
-        for user_id in top_users[:10]:
-            for i in range(self.num_classification):
-                score_for_classification[i] += self.user_datas.iloc[user_id][self.user_datas.columns[i+2]] * (2 ** weight)
-            weight -= 1
-
-        total = sum(score_for_classification)
-        for i in range(self.num_classification):
-            score_for_classification[i] = (int)(score_for_classification[i] / (total/100))
-
-        await CrawledArticleRepository().set_interest_type(article_id, score_for_classification, session)
-
-        return score_for_classification
-
-    def get_time_weight(self, article_id):
-        today = datetime.now().date()
-        date_obj = datetime.strptime(
-            self.article_datas[self.article_datas["article_id"] == article_id][
-                "created_at"
-            ].iloc[0],
-            "%Y-%m-%d",
-        ).date()
-        difference = today - date_obj
-        return max(1 - ((difference.days // 30) / 5), 0)
-
-    def fit_model_partialy(self):
-        self.make_dataset()
-        self.model.fit_partial(self.interactions, item_features=self.item_features)
-
-    def add_interaction_data(self, interaction_data: InteractionDataInfo):
-        InteractionRepository().create(
-            Interaction(
-                classification_id=interaction_data.interaction_data['classification_id'],
-                article_id=interaction_data.interaction_data['article_id'],
-                duration_time=interaction_data.interaction_data['duration_time']
-            )
-        )
\ No newline at end of file
+# # pylint: disable=missing-module-docstring, attribute-defined-outside-init, unnecessary-comprehension, not-callable, consider-using-f-string, unused-variable
+#
+# import asyncio
+# import os
+# import warnings
+# from datetime import datetime
+#
+# import numpy as np
+# import pandas as pd
+# from fastapi import Depends
+# from sqlalchemy.ext.asyncio import AsyncSession
+#
+# from app.database.repository import model_to_dict
+# from app.database.session import get_db_session
+# from app.model.crawled_article import Articles
+# from app.repository.crawled_article_crud import CrawledArticleRepository
+# from app.service.article_manage_service import ArticleManageService
+# from app.repository.interaction_crud import InteractionRepository
+# from app.model.interaction import Interaction
+# from lightfm import LightFM
+# from lightfm.data import Dataset  # pylint: disable=E0611
+#
+# warnings.filterwarnings("ignore")
+#
+#
+# def articles_to_dataframe(articles: list[Articles]) -> pd.DataFrame:
+#     # Convert the list of model objects into a list of dicts
+#     articles_dict_list = [
+#         {
+#             "article_id": article.id,
+#             'ECONOMY_AND_BUSINESS': 0,
+#             'POLITICS_AND_SOCIETY': 0,
+#             'SPORTS_AND_LEISURE': 0,
+#             'TECHNOLOGY_AND_CULTURE': 0
+#             # "created_at": article.created_at.strftime('%Y-%m-%d'),
+#         }
+#         for article in articles
+#     ]
+#     for i in range(len(articles_dict_list)):
+#         articles_dict_list[i][articles[i].category] = 1
+#
+#     df = pd.DataFrame(articles_dict_list)
+#     return df
+#
+# def interaction_to_dataframe(interactions : list[Interaction]) -> pd.DataFrame:
+#     interaction_dict_list = [
+#         {
+#             "classification_id": interaction.classification_id,
+#             "article_id": interaction.article_id,
+#             "duration_time": interaction.duration_time
+#         }
+#         for interaction in interactions
+#     ]
+#     df = pd.DataFrame(interaction_dict_list)
+#     return df
+#
+# class ArticleDataInfo:
+#     def __init__(self, article_id, category, created_at):
+#         self.article_data = pd.DataFrame(
+#             {
+#                 "article_id": article_id,
+#                 "경제 및 기업": [0],
+#                 "정치 및 사회": [0],
+#                 "기술 및 문화": [0],
+#                 "스포츠 및 여가": [0],
+#                 "오피니언 및 분석": [0],
+#                 # "created_at": [created_at],
+#             }
+#         )
+#
+#         self.article_data.iloc[0][category] = 1
+#
+#
+# class InteractionDataInfo:
+#     def __init__(self, user_id, article_id, duration_time):
+#         self.interaction_data = pd.DataFrame(
+#             {
+#                 "classification_id": [user_id],
+#                 "article_id": [article_id],
+#                 "duration_time": [duration_time],
+#             }
+#         )
+#
+#
+# class RecommendService:
+#     # pylint: disable=too-many-instance-attributes
+#
+#     def __init__(self):
+#         self.interaction_datas = None
+#         self.num_classification = 5
+#
+#     def set_user_datas(self, user_data_path):
+#         self.user_data_path = user_data_path
+#         self.user_datas = pd.read_csv(os.path.dirname(os.path.abspath(__file__)) + user_data_path)
+#
+#
+#     async def initialize_data(self, session):
+#         self.set_user_datas("/./user_classification.csv")
+#         await self.set_article_datas(session)
+#         await self.set_interaction_datas(session)
+#
+#     async def set_article_datas(self, session):
+#         # session = Depends(get_db_session)
+#         articles = await ArticleManageService().get_all_articles(session=session)
+#         self.article_datas = pd.get_dummies(articles_to_dataframe(articles))
+#
+#     async def set_interaction_datas(self, session):
+#         # session = Depends(get_db_session)
+#         interactions = await InteractionRepository().get_all(session=session)
+#         self.interaction_datas = interaction_to_dataframe(interactions)
+#         print(self.interaction_datas.columns)
+#
+#     def make_dataset(self):
+#         self.user_datas = pd.get_dummies(self.user_datas)
+#         self.user_features_col = self.user_datas.drop(
+#             columns=["classification_id"]
+#         ).columns.values
+#         self.user_feat = self.user_datas.drop(columns=["classification_id"]).to_dict(
+#             orient="records"
+#         )
+#
+#         self.item_features = self.article_datas
+#         self.item_features_col = self.item_features.drop(
+#             columns=["article_id"]
+#         ).columns.values
+#         self.item_feat = self.item_features.drop(
+#             columns=["article_id"]
+#         ).to_dict(orient="records")
+#
+#         self.dataset = Dataset()
+#         self.dataset.fit(
+#             users=[x for x in self.user_datas["classification_id"]],
+#             items=[x for x in self.article_datas["article_id"]],
+#             item_features=self.item_features_col,
+#             user_features=self.user_features_col,
+#         )
+#
+#         print(self.item_feat)
+#         self.item_features = self.dataset.build_item_features(
+#             (x, y) for x, y in zip(self.item_features["article_id"], self.item_feat)
+#         )
+#         self.user_features = self.dataset.build_user_features(
+#             (x, y) for x, y in zip(self.user_datas["classification_id"], self.user_feat)
+#         )
+#
+#         (self.interactions, self.weights) = self.dataset.build_interactions(
+#             (x, y, z)
+#             for x, y, z in zip(
+#                 self.interaction_datas["classification_id"],
+#                 self.interaction_datas["article_id"],
+#                 self.interaction_datas["duration_time"],
+#             )
+#         )
+#
+#         num_users, num_items = self.dataset.interactions_shape()
+#         print("Num users: {}, num_items {}.".format(num_users, num_items))
+#
+#     def make_model(
+#         self,
+#         n_components: int = 30,
+#         loss: str = "warp",
+#         epoch: int = 30,
+#         num_thread: int = 4,
+#     ):
+#         self.n_components = n_components
+#         self.loss = loss
+#         self.epoch = epoch
+#         self.num_thread = num_thread
+#         self.model = LightFM(
+#             no_components=self.n_components, loss=self.loss, random_state=1616
+#         )
+#
+#     def fit_model(self):
+#         self.make_dataset()
+#         self.make_model()
+#         self.model.fit(
+#             self.interactions,
+#             user_features=self.user_features,
+#             item_features=self.item_features,
+#             epochs=self.epoch,
+#             num_threads=self.num_thread,
+#             sample_weight=self.weights,
+#         )
+#
+#     def get_top_n_articles(self, user_id: int, article_num: int):
+#         item_ids = np.arange(self.interactions.shape[1])  # array of item IDs to score
+#
+#         predictions = self.model.predict(user_id, item_ids)
+#         top_items = self.article_datas.iloc[np.argsort(-predictions)[:article_num]]
+#         return top_items
+#
+#     def similar_items(self, item_id, N=10):
+#         item_bias, item_representations = self.model.get_item_representations(
+#             features=self.item_features
+#         )
+#
+#         scores = item_representations.dot(item_representations[item_id, :])
+#         best = np.argpartition(scores, -N)[-N:]
+#
+#         return self.article_datas.iloc[best]
+#
+#     async def get_classification_for_article(self, article_id:id, session:AsyncSession):
+#         scores = self.model.predict(np.arange(len(self.user_datas)), np.full(len(self.user_datas), article_id))
+#         top_users = np.argsort(-scores)
+#
+#         score_for_classification = [0 for _ in range(self.num_classification)]
+#         weight = 10
+#         for user_id in top_users[:10]:
+#             for i in range(self.num_classification):
+#                 score_for_classification[i] += self.user_datas.iloc[user_id][self.user_datas.columns[i+2]] * (2 ** weight)
+#             weight -= 1
+#
+#         total = sum(score_for_classification)
+#         for i in range(self.num_classification):
+#             score_for_classification[i] = (int)(score_for_classification[i] / (total/100))
+#
+#         await CrawledArticleRepository().set_interest_type(article_id, score_for_classification, session)
+#
+#         return score_for_classification
+#
+#     def get_time_weight(self, article_id):
+#         today = datetime.now().date()
+#         date_obj = datetime.strptime(
+#             self.article_datas[self.article_datas["article_id"] == article_id][
+#                 "created_at"
+#             ].iloc[0],
+#             "%Y-%m-%d",
+#         ).date()
+#         difference = today - date_obj
+#         return max(1 - ((difference.days // 30) / 5), 0)
+#
+#     def fit_model_partialy(self):
+#         self.make_dataset()
+#         self.model.fit_partial(self.interactions, item_features=self.item_features)
+#
+#     def add_interaction_data(self, interaction_data: InteractionDataInfo):
+#         InteractionRepository().create(
+#             Interaction(
+#                 classification_id=interaction_data.interaction_data['classification_id'],
+#                 article_id=interaction_data.interaction_data['article_id'],
+#                 duration_time=interaction_data.interaction_data['duration_time']
+#             )
+#         )

From 649b2b1ee6ab341dd7536b97ea596817b024f50b Mon Sep 17 00:00:00 2001
From: stopmin
Date: Mon, 15 Jul 2024 23:15:51 +0900
Subject: [PATCH 2/5] Modify Dockerfile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index f94ed48..d4ab385 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ WORKDIR /app
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends gcc libpq-dev libatlas-base-dev && \
-    pip install --no-cache-dir pipenv
+    pip install --no-cache-dir pipenv \
 
 COPY Pipfile Pipfile.lock ./
 RUN pipenv install --deploy --ignore-pipfile

From e8e8cf5f7fdeacd1f8d0dad48f0a06bcc567131a Mon Sep 17 00:00:00 2001
From: stopmin
Date: Mon, 15 Jul 2024 23:19:24 +0900
Subject: [PATCH 3/5] Modify Dockerfile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index d4ab385..f94ed48 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,7 +4,7 @@ WORKDIR /app
 
 RUN apt-get update && \
     apt-get install -y --no-install-recommends gcc libpq-dev libatlas-base-dev && \
-    pip install --no-cache-dir pipenv \
+    pip install --no-cache-dir pipenv
 
 COPY Pipfile Pipfile.lock ./
 RUN pipenv install --deploy --ignore-pipfile

From ff2f5b98a76d0bcd969e534f5f8c3f4db6189311 Mon Sep 17 00:00:00 2001
From: stopmin
Date: Mon, 15 Jul 2024 23:24:33 +0900
Subject: [PATCH 4/5] Modify ECS file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
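
Adds the missing `fi` that closes the deployment-verification `if` block;
without it the step's inline shell script does not parse. The step compares
the service's currently running task-definition ARN against the newly
registered one. A rough Python equivalent of that check (boto3 assumed;
the cluster, service, and ARN values are placeholders, not the project's
real names):

    import boto3

    ecs = boto3.client("ecs")
    # Look up the task definition the service is actually running
    service = ecs.describe_services(
        cluster="my-cluster", services=["my-service"]
    )["services"][0]

    new_task_def_arn = "arn:aws:ecs:region:acct:task-definition/app:42"
    if service["taskDefinition"] != new_task_def_arn:
        print("Deployment failed.")
        raise SystemExit(1)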
---
 .github/workflows/deploy-ecs.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/deploy-ecs.yml b/.github/workflows/deploy-ecs.yml
index 039d826..198931a 100644
--- a/.github/workflows/deploy-ecs.yml
+++ b/.github/workflows/deploy-ecs.yml
@@ -132,6 +132,7 @@ jobs:
           if [ "$CURRENT_TASK_DEF_ARN" != "$NEW_TASK_DEF_ARN" ]; then
             echo "Deployment failed."
             exit 1
+          fi
 
       - name: Post Slack Channel that Build Success
         if: success()

From 41d2eef2be719acb73e965f066913907cd83dc00 Mon Sep 17 00:00:00 2001
From: stopmin
Date: Mon, 15 Jul 2024 23:26:26 +0900
Subject: [PATCH 5/5] Comment out recommendation system in scheduling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
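
With the LightFM RecommendService disabled in patch 1/5, the scheduler's
classification step is commented out as well so the module import no
longer breaks the crawl job. Condensed, the disabled block did the
following (excerpted from the commented-out code below; it runs inside
the async crawl task, which already holds `session: AsyncSession`):

    recommend_service = RecommendService()
    await recommend_service.initialize_data(session=session)
    recommend_service.fit_model()

    # Classify only articles the recommender has not processed yet
    articles = await ArticleManageService().get_all_articles(session=session)
    for article in articles:
        if article.probability_issue_finder == -1:
            await recommend_service.get_classification_for_article(
                article_id=article.id, session=session
            )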
---
 app/service/news_scheduling_service.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/app/service/news_scheduling_service.py b/app/service/news_scheduling_service.py
index 5001da1..b7f70d6 100644
--- a/app/service/news_scheduling_service.py
+++ b/app/service/news_scheduling_service.py
@@ -3,13 +3,11 @@
 
 import aiohttp
 import feedparser
-from dotenv import load_dotenv
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from app.config.loguru_config import logger
 from app.database.session import db_session
 from app.model.article_publisher import Publisher
-from app.recommend.recommend_service import RecommendService
 from app.service.article_manage_service import ArticleManageService
 from app.service.simple_article_service import process_generate_article_by_url
 
@@ -80,22 +78,22 @@ async def run_crawl_and_store(session: AsyncSession):
     else:
         logger.info("No new articles")
 
-    new_exist_articles = await ArticleManageService().get_all_articles(session=session)
+    # new_exist_articles = await ArticleManageService().get_all_articles(session=session)
     # Filter to only the new articles
-    new_articles_id = [
-        article.id
-        for article in new_exist_articles
-        if article.probability_issue_finder == -1
-    ]
-    recommend_service = RecommendService()
-    await recommend_service.initialize_data(session=session)
-    recommend_service.fit_model()
-    if new_articles:
-        for article_id in new_articles_id:
-            await recommend_service.get_classification_for_article(
-                article_id=article_id, session=session
-            )
+    # new_articles_id = [
+    #     article.id
+    #     for article in new_exist_articles
+    #     if article.probability_issue_finder == -1
+    # ]
+    # recommend_service = RecommendService()
+    # await recommend_service.initialize_data(session=session)
+    # recommend_service.fit_model()
+    # if new_articles:
+    #     for article_id in new_articles_id:
+    #         await recommend_service.get_classification_for_article(
+    #             article_id=article_id, session=session
+    #         )
 
 
 async def schedule_task():