From f00413055c490c851a43547d1a89123261e53133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maik=20Fr=C3=B6be?= Date: Wed, 25 Oct 2023 08:59:49 +0000 Subject: [PATCH] mf --- notebooks/calculate-sbert-embeddings.ipynb | 221 ++------------------- 1 file changed, 14 insertions(+), 207 deletions(-) diff --git a/notebooks/calculate-sbert-embeddings.ipynb b/notebooks/calculate-sbert-embeddings.ipynb index 2f71cebb..8fce4c0f 100644 --- a/notebooks/calculate-sbert-embeddings.ipynb +++ b/notebooks/calculate-sbert-embeddings.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "12037192-0b2a-4eea-aee7-4aca617914f5", "metadata": {}, "outputs": [ @@ -22,7 +22,7 @@ "'3.8.13 | packaged by conda-forge | (default, Mar 25 2022, 06:04:10) \\n[GCC 10.3.0]'" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -35,7 +35,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "89422d77-a927-46f1-85de-19bd9d548b03", "metadata": {}, "outputs": [], @@ -59,25 +59,19 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "a231e76c-1e8f-416f-a338-097a6c7e2b37", "metadata": {}, "outputs": [ { - "ename": "ImportError", - "evalue": "Failed to load PyTorch C extensions:\n It appears that PyTorch has loaded the `torch/_C` folder\n of the PyTorch repository rather than the C extensions which\n are expected in the `torch._C` namespace. This can occur when\n using the `install` workflow. e.g.\n $ python setup.py install && python -c \"import torch\"\n\n This error can generally be solved using the `develop` workflow\n $ python setup.py develop && python -c \"import torch\" # This should succeed\n or by running Python from a different directory.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn [3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mtransform_to_sbert_embeddings\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhello world foo\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n", - "Cell \u001b[0;32mIn [2], line 12\u001b[0m, in \u001b[0;36mtransform_to_sbert_embeddings\u001b[0;34m(input)\u001b[0m\n\u001b[1;32m 10\u001b[0m sys\u001b[38;5;241m.\u001b[39mpath \u001b[38;5;241m=\u001b[39m [VENV_LIB_DIR] \u001b[38;5;241m+\u001b[39m sys\u001b[38;5;241m.\u001b[39mpath\n\u001b[1;32m 11\u001b[0m os\u001b[38;5;241m.\u001b[39menviron[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mSENTENCE_TRANSFORMERS_HOME\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/sbert-env/sentence-transformer-home\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m---> 12\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msentence_transformers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentenceTransformer\n\u001b[1;32m 13\u001b[0m sbert_model \u001b[38;5;241m=\u001b[39m SentenceTransformer(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mparaphrase-MiniLM-L6-v2\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m json\u001b[38;5;241m.\u001b[39mdumps(sbert_model\u001b[38;5;241m.\u001b[39mencode(\u001b[38;5;28minput\u001b[39m)\u001b[38;5;241m.\u001b[39mtolist())\n", - "File \u001b[0;32m/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/sbert-env-python3.7/lib/python3.7/site-packages/sentence_transformers/__init__.py:3\u001b[0m\n\u001b[1;32m 1\u001b[0m __version__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m2.0.0\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m __MODEL_HUB_ORGANIZATION__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msentence-transformers\u001b[39m\u001b[38;5;124m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentencesDataset, ParallelSentencesDataset\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mLoggingHandler\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LoggingHandler\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mSentenceTransformer\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SentenceTransformer\n", - "File \u001b[0;32m/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/sbert-env-python3.7/lib/python3.7/site-packages/sentence_transformers/datasets/__init__.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mDenoisingAutoEncoderDataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DenoisingAutoEncoderDataset\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mNoDuplicatesDataLoader\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NoDuplicatesDataLoader\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mParallelSentencesDataset\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ParallelSentencesDataset\n", - "File \u001b[0;32m/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/sbert-env-python3.7/lib/python3.7/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py:1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdata\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Dataset\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m List\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mreaders\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mInputExample\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m InputExample\n", - "File \u001b[0;32m/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/sbert-env-python3.7/lib/python3.7/site-packages/torch/__init__.py:214\u001b[0m\n\u001b[1;32m 212\u001b[0m \u001b[38;5;66;03m# The __file__ check only works for Python 3.7 and above.\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m sys\u001b[38;5;241m.\u001b[39mversion_info \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m (\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m7\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m _C_for_compiled_check\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__file__\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 214\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mImportError\u001b[39;00m(textwrap\u001b[38;5;241m.\u001b[39mdedent(\u001b[38;5;124m'''\u001b[39m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;124m Failed to load PyTorch C extensions:\u001b[39m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;124m It appears that PyTorch has loaded the `torch/_C` folder\u001b[39m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;124m of the PyTorch repository rather than the C extensions which\u001b[39m\n\u001b[1;32m 218\u001b[0m \u001b[38;5;124m are expected in the `torch._C` namespace. This can occur when\u001b[39m\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m using the `install` workflow. e.g.\u001b[39m\n\u001b[1;32m 220\u001b[0m \u001b[38;5;124m $ python setup.py install && python -c \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimport torch\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 221\u001b[0m \n\u001b[1;32m 222\u001b[0m \u001b[38;5;124m This error can generally be solved using the `develop` workflow\u001b[39m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;124m $ python setup.py develop && python -c \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mimport torch\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m # This should succeed\u001b[39m\n\u001b[1;32m 224\u001b[0m \u001b[38;5;124m or by running Python from a different directory.\u001b[39m\n\u001b[1;32m 225\u001b[0m \u001b[38;5;124m \u001b[39m\u001b[38;5;124m'''\u001b[39m)\u001b[38;5;241m.\u001b[39mstrip()) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 226\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;66;03m# If __file__ is not None the cause is unknown, so just re-raise.\u001b[39;00m\n\u001b[1;32m 229\u001b[0m __all__ \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m [name \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mdir\u001b[39m(_C)\n\u001b[1;32m 230\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m_\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[1;32m 231\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m name\u001b[38;5;241m.\u001b[39mendswith(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mBase\u001b[39m\u001b[38;5;124m'\u001b[39m)]\n", - "\u001b[0;31mImportError\u001b[0m: Failed to load PyTorch C extensions:\n It appears that PyTorch has loaded the `torch/_C` folder\n of the PyTorch repository rather than the C extensions which\n are expected in the `torch._C` namespace. This can occur when\n using the `install` workflow. e.g.\n $ python setup.py install && python -c \"import torch\"\n\n This error can generally be solved using the `develop` workflow\n $ python setup.py develop && python -c \"import torch\" # This should succeed\n or by running Python from a different directory." - ] + "data": { + "text/plain": [ + "'[-0.2869361340999603, 0.21075160801410675, 0.17087240517139435, -0.21817827224731445, 0.30960220098495483, -0.4539799094200134, 0.6447420716285706, -0.42383041977882385, 0.057523518800735474, 0.29355257749557495, -0.056013356894254684, -0.9076987504959106, 0.13708749413490295, 0.14686094224452972, 0.666681706905365, -0.11197157204151154, 0.11379440128803253, -0.5456849336624146, -0.6463279724121094, -0.5853947401046753, -0.2384648621082306, 0.05953194573521614, 0.20145022869110107, 0.18753255903720856, 0.06930533051490784, 0.08370830863714218, -0.12975147366523743, 0.19339627027511597, 0.8819394111633301, 0.22021707892417908, 0.09086017310619354, 0.23316843807697296, 0.12768423557281494, 0.13054482638835907, 0.11542203277349472, 0.5505728125572205, 0.3658863306045532, 0.21320843696594238, -0.035716790705919266, -0.31554749608039856, -0.05744757503271103, -0.47509145736694336, 0.4187689423561096, 0.1606564223766327, -0.22457638382911682, -0.11338746547698975, -0.42795515060424805, 0.14503338932991028, -0.20146667957305908, 0.17299585044384003, 0.24137599766254425, -0.012937774881720543, -0.0686570554971695, -0.063334621489048, 1.0604698657989502, 0.5236873626708984, -0.14674682915210724, 0.48384103178977966, -0.07478490471839905, -0.4421261250972748, -0.6518052816390991, -0.6490849256515503, 0.048649657517671585, 0.6000299453735352, 0.3820922374725342, 0.0434800386428833, -0.42657846212387085, 0.28174471855163574, -0.9861127138137817, -0.5990506410598755, -0.3805365562438965, 0.15456224977970123, -0.37817269563674927, 0.23916402459144592, 0.24073728919029236, -0.2600741982460022, 0.1804235279560089, -0.21535079181194305, -0.0626956894993782, 0.5741399526596069, -0.08866278827190399, 0.35312968492507935, 0.3330707550048828, -0.28344088792800903, 0.1664445549249649, -0.3495074212551117, 0.13165247440338135, -0.1116424947977066, 0.0881507396697998, -0.3525536358356476, -0.6333794593811035, -0.32296255230903625, 0.8193322420120239, 0.6472972631454468, -0.16785624623298645, 0.1662226915359497, 0.6157865524291992, -0.10184872150421143, -0.3967526853084564, 0.9837255477905273, 0.26837071776390076, 0.10525327920913696, 0.12976780533790588, 0.2718716561794281, -0.1164795532822609, 0.39679497480392456, -0.1867164671421051, 0.5642822980880737, 0.32886046171188354, 0.06143639236688614, -0.34797152876853943, -0.13181951642036438, 0.0390951931476593, -0.10480242967605591, -0.23622509837150574, -0.37797898054122925, -0.303629070520401, 0.1816607415676117, 0.24702712893486023, -0.47256970405578613, 0.06021475791931152, -0.1375226229429245, -0.2724878489971161, -0.3746926188468933, -0.8399063348770142, 0.16943144798278809, 0.7007195949554443, 0.20154938101768494, 0.18402767181396484, 0.2783129811286926, 0.08820681273937225, 0.17923657596111298, 0.25214922428131104, 0.20323698222637177, 0.08651929348707199, 0.027989601716399193, -0.2938998341560364, -0.04304143041372299, 0.039412837475538254, 0.1947362869977951, -0.3610769212245941, -0.1560358852148056, 0.40472835302352905, -0.18699786067008972, 0.20600512623786926, 0.4984048306941986, 0.2813710570335388, 0.5909144282341003, 0.10895521938800812, -0.5188388824462891, 0.2800285816192627, -0.017558271065354347, 0.051906753331422806, 0.7013188600540161, 0.11792576313018799, 0.2018057405948639, 1.370363473892212, 0.49203959107398987, 0.46435704827308655, -0.7138314247131348, -0.3956274092197418, -0.01021982729434967, -0.223377987742424, -0.13058032095432281, -0.39369043707847595, -0.40162748098373413, -0.2145557850599289, -0.02420281246304512, -0.5118784308433533, 0.4924958348274231, -0.5065075159072876, -0.13179540634155273, -0.2172427922487259, -0.01649879291653633, 0.2745550274848938, -0.8162962794303894, 0.6703075170516968, 0.749026894569397, -0.5696446299552917, 0.4742124676704407, -0.5079742670059204, -0.1440119594335556, 0.19094470143318176, 0.25761669874191284, -0.05969592183828354, 0.4444425106048584, -0.054939813911914825, 0.2351851910352707, -0.016040032729506493, -0.20256170630455017, 0.2706785500049591, -0.04696928709745407, 0.1741829365491867, 0.5208709836006165, 0.1819244772195816, -0.2258223295211792, 0.1600428968667984, -0.46301817893981934, 0.32119742035865784, -0.5146179795265198, -0.001976871397346258, 0.3720618486404419, 0.0910039097070694, -0.24144306778907776, 0.6522257924079895, 0.036948464810848236, 0.01692175306379795, 0.09105195105075836, 0.054737675935029984, 0.4085533022880554, -0.34368476271629333, 0.5428768992424011, 0.1310042440891266, -0.5313072800636292, 0.15956339240074158, -0.27638787031173706, -0.3991944193840027, -0.10450369119644165, -0.8337167501449585, 0.03768942505121231, 0.09057217836380005, -0.28318291902542114, -0.48632311820983887, -0.37988385558128357, 0.7324431538581848, -0.003458034945651889, -1.020525336265564, -0.39289528131484985, 0.21508602797985077, -0.046069879084825516, 0.4449179768562317, 0.890733540058136, -0.4023515284061432, 0.2691081464290619, 0.21368595957756042, 0.4647640287876129, 0.06767793744802475, 0.035401977598667145, -0.21628911793231964, 0.3992978632450104, 0.7985233068466187, 0.10038657486438751, -0.5779743194580078, 0.6852051615715027, -0.5251721143722534, -0.19205912947654724, -1.1571784019470215, -0.1250920593738556, -0.5508983135223389, 0.4552682042121887, 0.18564409017562866, -0.07241475582122803, -0.6578279733657837, 0.27717965841293335, 0.49311503767967224, 0.2744714617729187, -0.8566438555717468, 0.1035858616232872, -0.12466287612915039, -0.40867024660110474, -0.7187258005142212, -0.32887160778045654, -0.0804123505949974, -0.24602527916431427, 0.08117065578699112, 0.04502936080098152, -0.15664243698120117, 0.7970420718193054, -0.3507902920246124, 0.24144522845745087, -1.1732624769210815, 0.1811768114566803, -0.037474799901247025, -0.10071678459644318, -0.2220536768436432, -0.16171012818813324, 0.6124669313430786, -0.5624381303787231, -0.24034056067466736, -0.0877816453576088, -0.7241109013557434, -0.11089466512203217, 0.22268366813659668, -0.09911128133535385, -0.40810585021972656, 0.060431621968746185, 0.2710609436035156, 0.787848174571991, -0.5679223537445068, 0.015814250335097313, -0.024305563420057297, 0.8028009533882141, 0.10332636535167694, -0.6053292751312256, -0.44467368721961975, -0.15777704119682312, -0.7074443101882935, 0.20637650787830353, -0.006332471966743469, 0.1810898631811142, 0.20988821983337402, 0.02915247343480587, 0.2258332073688507, 0.24949827790260315, 0.4978683590888977, 0.5357446670532227, -0.25460073351860046, -0.40645942091941833, 0.3531019985675812, -0.4642309546470642, -0.13392587006092072, 0.35694482922554016, -0.34601250290870667, -0.2601010203361511, 0.1651051789522171, 0.4652708172798157, -1.0331456661224365, 0.7157890796661377, 0.031012415885925293, -0.039696503430604935, -0.7348887324333191, -0.053620826452970505, 0.002939158584922552, 0.010799029842019081, -0.4030298590660095, 0.1981346607208252, 0.08188919723033905, -0.8525096774101257, -0.03441426903009415, 0.4393184185028076, 0.040236204862594604, -0.02113059163093567, -0.29143479466438293, 0.01631772704422474, -0.34429851174354553, 0.1693991869688034, -0.23429055511951447, -0.012453245930373669, -0.1402885913848877, -0.02480209805071354, -0.17802028357982635, 0.15754643082618713, 0.3305588662624359, 0.04731098562479019, 0.03472224995493889, -0.18114306032657623, -0.28224191069602966, -0.3268497586250305, 0.024157419800758362, -0.22142326831817627, 0.28454098105430603, 0.8066592216491699, -0.5806676149368286, -0.31951242685317993, 0.2991618514060974, 0.07273989915847778, -0.1747395545244217, -0.03688020631670952, 0.03934355825185776, 0.02106265351176262, 0.7195046544075012, 0.1962379515171051, 0.201066255569458, -0.27456849813461304, -0.12730291485786438, 0.41243067383766174, 0.11243561655282974, -0.14202934503555298, -0.5124541521072388, -0.41919589042663574, -0.4419211745262146, -0.3028791844844818, 0.3192686438560486, -0.2620718479156494, 0.02036193385720253, 0.1418408304452896, -0.353306382894516, 0.13468030095100403, 0.12894940376281738, 0.670450747013092, 0.6815897226333618, -0.16320526599884033, 0.2503185570240021, -0.11151257902383804]'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -206,194 +200,7 @@ "\n", "sc.parallelize(queries, 10000)\\\n", " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-ms-marco-train.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cfb67337-1425-4625-a9ba-d960b69652c1", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "250it [00:00, 38687.13it/s]\n" - ] - } - ], - "source": [ - "robust04_desc_queries = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/robust04-descriptions.jsonl')\n", - "\n", - "sc.parallelize(robust04_desc_queries, 100)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-robust04-descriptions.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f31203bd-6631-45e3-81ba-18254d722d65", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "3151it [00:00, 108014.74it/s]\n" - ] - } - ], - "source": [ - "robust04_formulation_queries = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/culpepper-robust04-query-formulations.jsonl')\n", - "\n", - "sc.parallelize(robust04_formulation_queries, 100)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/culpepper-robust04-query-formulations.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "f92e5c00-8dc0-4a2f-93f7-9e05ae681e5e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "250it [00:00, 41929.62it/s]\n" - ] - } - ], - "source": [ - "robust04_queries = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/trec-robust04.jsonl')\n", - "\n", - "sc.parallelize(robust04_queries, 10)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-trec-robust04.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "99aafba2-b9fb-42fb-b96c-b6a3f2c7958f", - "metadata": {}, - "outputs": [], - "source": [ - "sc.textFile('file:///mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/msmarco-document-orcas.jsonl')\\\n", - " .repartition(100)\\\n", - " .saveAsTextFile('ecir22/original-ms-marco-orcas.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "312a8d54-0d0a-4e8b-bca5-5f35a907e2c8", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1505it [00:00, 71136.36it/s]\n", - " \r" - ] - } - ], - "source": [ - "trec18_reformulations = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/culpepper-trec18-query-formulations.jsonl')\n", - "\n", - "sc.parallelize(trec18_reformulations, 10)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-trec18-reformulations.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "95ba9d06-0cdf-4945-aa35-24e40bd95fae", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "50it [00:00, 10988.48it/s]\n", - " \r" - ] - } - ], - "source": [ - "trec18_reformulations = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/wapo-core-2018-descriptions.jsonl')\n", - "\n", - "sc.parallelize(trec18_reformulations, 10)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-trec18-descriptions.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "230170c7-db65-443a-8019-a6523e8f5121", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "50it [00:00, 12575.12it/s]\n", - " \r" - ] - } - ], - "source": [ - "trec18_reformulations = load_queries('/mnt/ceph/storage/data-in-progress/data-research/web-search/ECIR-22/ecir22-zero-shot/wapo-core-2018-titles.jsonl')\n", - "\n", - "sc.parallelize(trec18_reformulations, 10)\\\n", - " .map(append_sbert_embedding)\\\n", - " .saveAsTextFile('ecir22/embeddings-trec18-titles.jsonl')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6dd83a4a-fe0c-4f0f-aaaf-156419bdb305", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Process \"ecir22/original-ms-marco-orcas.jsonl/part-*0\" and write to \"ecir22/embeddings-ms-marco-orcas/part-0.jsonl\".\n", - "Process \"ecir22/original-ms-marco-orcas.jsonl/part-*1\" and write to \"ecir22/embeddings-ms-marco-orcas/part-1.jsonl\".\n", - "Process \"ecir22/original-ms-marco-orcas.jsonl/part-*2\" and write to \"ecir22/embeddings-ms-marco-orcas/part-2.jsonl\".\n" - ] - } - ], - "source": [ - "import json\n", - "\n", - "for part in range(10):\n", - " input_file = 'ecir22/original-ms-marco-orcas.jsonl/part-*' + str(part)\n", - " output_file = 'ecir22/embeddings-ms-marco-orcas/part-' + str(part) + '.jsonl'\n", - " \n", - " print('Process \"' + input_file + '\" and write to \"' + output_file +'\".')\n", - " sc.textFile(input_file)\\\n", - " .repartition(10000) \\\n", - " .map(lambda i: append_sbert_embedding(json.loads(i)))\\\n", - " .saveAsTextFile(output_file)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d9ce53ec-3829-49f1-bb9a-3fb204fdaadc", - "metadata": {}, - "outputs": [], - "source": [ - "# Run ORCAS simply via writing to hdfs (with 100 parts) and combining all parts again" + " .saveAsTextFile(todo-find-directory-in-aql)" ] }, { @@ -405,7 +212,7 @@ { "data": { "text/plain": [ - "'[-0.08469943702220917, 0.4564870297908783, 0.10086822509765625, -0.11005697399377823, 0.04533920809626579, -0.3661494851112366, 0.3580332398414612, -0.07830145955085754, -0.24362005293369293, 0.2736354470252991, 0.10803011804819107, -0.7733525633811951, -0.13244451582431793, 0.19357416033744812, 0.20357149839401245, -0.36997082829475403, 0.3396165668964386, -0.6763170957565308, -0.849643886089325, -0.19371700286865234, -0.230658158659935, -0.02822048030793667, 0.34298205375671387, 0.20185132324695587, 0.24069765210151672, -0.27264273166656494, 0.07051801681518555, 0.28514087200164795, 0.27905553579330444, -0.010386943817138672, -0.011181384325027466, 0.04527701437473297, 0.24535013735294342, -0.020689629018306732, 0.047592952847480774, 0.48722851276397705, 0.1144338995218277, -0.12156331539154053, -0.15390871465206146, -0.11076535284519196, 0.29678648710250854, -0.4554610550403595, 0.11478982865810394, 0.06491449475288391, -0.2805788516998291, -0.005584864877164364, -0.03243638575077057, 0.19312036037445068, -0.025492660701274872, 0.10158758610486984, 0.00930526852607727, -0.20151090621948242, 0.1436285376548767, -0.0898931473493576, 0.7997888326644897, 0.19379879534244537, 0.16400004923343658, 0.5003919005393982, 0.04844425618648529, -0.4123835861682892, -0.41374486684799194, -0.47276395559310913, -0.025326386094093323, 0.8419860601425171, 0.1620156168937683, 0.1532735824584961, -0.2606377601623535, 0.43784016370773315, -0.7216449975967407, -0.9117643237113953, -0.3602350950241089, -0.06881894171237946, -0.048498839139938354, 0.19168256223201752, 0.23297375440597534, -0.2777986526489258, 0.37963008880615234, -0.1429784893989563, 0.1916062831878662, 0.5609490275382996, 0.4084661602973938, -0.05393568426370621, -0.010461628437042236, -0.5830550193786621, 0.4845767617225647, -0.113331139087677, 0.1029115840792656, 0.17454679310321808, 0.04227719455957413, -0.6043915152549744, -0.7394313812255859, 0.2791154980659485, 0.7246915102005005, 0.43350058794021606, 0.012097332626581192, 0.1530357003211975, 0.31137943267822266, 0.0488445982336998, -0.6617786884307861, 1.5698521137237549, 0.36193904280662537, 0.148813396692276, 0.49641990661621094, 0.3448859751224518, -0.24441739916801453, 0.47320544719696045, -0.2631124258041382, 0.4993917644023895, 0.05329512059688568, 0.11406902968883514, -0.22877034544944763, -0.04336903244256973, -0.20341789722442627, 0.10597320646047592, 0.3737863302230835, 0.061443716287612915, 0.26065677404403687, 0.4802852272987366, 0.33708709478378296, -0.6553970575332642, 0.01377028226852417, -0.28922921419143677, -0.02539418265223503, -0.18594302237033844, -0.6735166907310486, -0.024639854207634926, 0.4803006649017334, 0.27536144852638245, 0.1822444498538971, 0.29827627539634705, -0.1919078528881073, 0.5343491435050964, -0.008797001093626022, 0.3069656491279602, -0.21810799837112427, -0.1038127988576889, -0.1398477554321289, -0.14189736545085907, -0.15162506699562073, 0.061655521392822266, -0.3548749089241028, 0.31456834077835083, 0.10652866214513779, -0.23747001588344574, 0.181940957903862, 0.5393760800361633, 0.29171961545944214, 0.5020653009414673, -0.012960508465766907, -0.3500747084617615, 0.1951698660850525, 0.007279247045516968, 0.009370051324367523, 0.4073788821697235, 0.25481605529785156, -0.10485465079545975, 1.185978651046753, 0.2828645706176758, 0.16491293907165527, -0.4207371771335602, -0.2507503032684326, -0.04462968185544014, -0.15479208528995514, -0.15878266096115112, -0.49950772523880005, -0.401295006275177, -0.28619885444641113, -0.11829467862844467, -0.37281930446624756, 0.17397359013557434, -0.487634539604187, -0.056973278522491455, 0.0026283860206604004, 0.0644322857260704, 0.21376872062683105, -0.3212123513221741, 0.8684197664260864, 0.7110563516616821, -0.2682185769081116, 0.1602502018213272, -0.8188387155532837, -0.31392616033554077, 0.21196943521499634, 0.27693450450897217, 0.10476401448249817, 0.08666456490755081, -0.05256887525320053, 0.01926770806312561, -0.21478891372680664, -0.21563410758972168, 0.014068856835365295, -0.42710888385772705, 0.2956087589263916, 0.2767024338245392, 0.12713828682899475, 0.10711798071861267, -0.3153403699398041, -0.7022724151611328, 0.04825931787490845, -0.28940823674201965, 0.15990160405635834, 0.2861554026603699, 0.3513185977935791, -0.06886308640241623, 0.6245968341827393, -0.29239535331726074, 0.026951082050800323, 0.11437326669692993, -0.009004540741443634, 0.8237249255180359, -0.12523536384105682, 0.24733461439609528, 0.13817434012889862, -0.3686709403991699, -0.06106532737612724, -0.12520039081573486, -0.24754446744918823, 0.19010043144226074, -0.7401747703552246, 0.23279879987239838, 0.393762469291687, -0.11006142199039459, -0.18028472363948822, -0.29953521490097046, 0.2426399290561676, -0.2433745414018631, -0.8143547177314758, -0.004466347396373749, 0.423747181892395, 0.06280062347650528, 0.2616772949695587, 1.0482226610183716, -0.3863975703716278, 0.4188492000102997, -0.02908133715391159, 0.1897907257080078, 0.24139191210269928, 0.13234248757362366, -0.3460733890533447, 0.05999527871608734, 0.6545636057853699, -0.021433144807815552, -0.3530394732952118, 0.42895081639289856, 0.00978078693151474, 0.09419979155063629, -1.0371383428573608, 0.11786166578531265, -0.6087011694908142, 0.6397134065628052, 0.04840543121099472, -0.1515769362449646, -0.6389087438583374, 0.11881941556930542, 0.07600940763950348, 0.25367462635040283, -0.6087325811386108, -0.13956084847450256, -0.04484392702579498, 0.4737672507762909, -0.5100858211517334, -0.5746822953224182, -0.47108083963394165, -0.08608388155698776, -2.3562461137771606e-05, 0.1184471920132637, -0.16412049531936646, 0.5381180047988892, -0.3951112627983093, -0.10469421744346619, -1.051822543144226, 0.11303254961967468, -0.15133926272392273, -0.07616102695465088, -0.18919247388839722, -0.02349192649126053, 0.43806013464927673, -0.33359280228614807, -0.15172746777534485, 0.220516175031662, -0.5247753858566284, -0.43345874547958374, 0.6387717127799988, 0.23654885590076447, -0.42217737436294556, 0.006013799458742142, -0.20688079297542572, 0.4139860272407532, -0.34582090377807617, 0.16556745767593384, 0.326760858297348, 0.693403959274292, 0.015689969062805176, -0.28676655888557434, -0.15708279609680176, -0.13936087489128113, -0.5055321455001831, -0.3429078459739685, 0.2088516801595688, 0.1658327430486679, -0.20893405377864838, 0.004718473181128502, 0.14146432280540466, -0.23770689964294434, 0.3338397145271301, 0.12973427772521973, -0.28573861718177795, -0.24765203893184662, 0.4254816174507141, -0.228287011384964, -0.2209286391735077, 0.20925205945968628, 0.13094007968902588, -0.050192397087812424, -0.013185754418373108, 0.2081996202468872, -0.5907852649688721, 0.30473050475120544, -0.04172981157898903, 0.0751330554485321, -0.49397754669189453, 0.3945575952529907, 0.027982477098703384, -0.032344989478588104, -0.459894597530365, 0.12571093440055847, -0.10395792126655579, -0.7523688673973083, -0.21520160138607025, 0.3537065386772156, -0.13149745762348175, -0.40234827995300293, -0.19095051288604736, 0.22419317066669464, -0.030423685908317566, 0.24737782776355743, -0.18795841932296753, -0.06761309504508972, -0.1787666231393814, -0.2439592033624649, -0.04282044619321823, -0.36860018968582153, 0.29594144225120544, 0.06608099490404129, 0.38329464197158813, -0.02986355498433113, -0.22199460864067078, -0.1492568850517273, 0.17321163415908813, -0.29076892137527466, 0.2355531007051468, 0.7190690040588379, -0.5451163053512573, -0.2757304608821869, 0.13347113132476807, -0.20016297698020935, -0.1607765108346939, -0.0661139115691185, -0.1851297914981842, -0.5091961026191711, 0.09560316801071167, 0.21853843331336975, 0.2611771821975708, -0.0001680031418800354, 0.011891904287040234, 0.5450519323348999, 0.4631463885307312, -0.37412595748901367, -0.3584563732147217, -0.024205949157476425, -0.3381759524345398, -0.31798043847084045, 0.3382137417793274, -0.34947019815444946, -0.184756338596344, -0.010816067457199097, -0.11279987543821335, 0.16977839171886444, -0.017699621617794037, 0.51512610912323, 0.3195386826992035, -0.6560264825820923, -0.36865824460983276, 0.14153510332107544]'" + "'[-0.08469964563846588, 0.45648717880249023, 0.1008683294057846, -0.11005671322345734, 0.04533912613987923, -0.36614954471588135, 0.35803312063217163, -0.0783013254404068, -0.24361993372440338, 0.27363526821136475, 0.1080302745103836, -0.7733523845672607, -0.1324445903301239, 0.19357404112815857, 0.20357096195220947, -0.3699706792831421, 0.33961641788482666, -0.6763169765472412, -0.849643886089325, -0.19371655583381653, -0.2306578904390335, -0.028220511972904205, 0.34298181533813477, 0.20185144245624542, 0.24069730937480927, -0.2726427912712097, 0.0705178901553154, 0.28514111042022705, 0.279055655002594, -0.010387003421783447, -0.011181145906448364, 0.04527720436453819, 0.24535025656223297, -0.020689554512500763, 0.047593019902706146, 0.4872283935546875, 0.11443371325731277, -0.12156311422586441, -0.153908833861351, -0.11076533794403076, 0.29678651690483093, -0.45546096563339233, 0.11479000747203827, 0.06491468846797943, -0.28057861328125, -0.005584833212196827, -0.03243649750947952, 0.1931203007698059, -0.025492094457149506, 0.10158748924732208, 0.009304866194725037, -0.20151078701019287, 0.1436285823583603, -0.08989334851503372, 0.7997884154319763, 0.19379885494709015, 0.16400006413459778, 0.5003917217254639, 0.04844442754983902, -0.41238343715667725, -0.41374480724334717, -0.47276341915130615, -0.02532695233821869, 0.841985821723938, 0.16201576590538025, 0.15327340364456177, -0.2606378197669983, 0.43784022331237793, -0.7216446399688721, -0.9117646217346191, -0.36023467779159546, -0.06881903111934662, -0.04849867522716522, 0.19168247282505035, 0.23297397792339325, -0.27779829502105713, 0.3796299695968628, -0.14297834038734436, 0.19160625338554382, 0.5609489679336548, 0.4084658920764923, -0.05393606051802635, -0.010461535304784775, -0.5830548405647278, 0.4845765233039856, -0.11333071440458298, 0.10291182994842529, 0.17454707622528076, 0.04227717965841293, -0.6043916940689087, -0.7394312620162964, 0.27911561727523804, 0.7246910333633423, 0.4335004985332489, 0.012097114697098732, 0.15303562581539154, 0.3113793134689331, 0.048844512552022934, -0.6617786288261414, 1.5698521137237549, 0.3619391918182373, 0.1488133668899536, 0.4964197874069214, 0.3448866009712219, -0.24441763758659363, 0.4732050597667694, -0.2631121575832367, 0.49939170479774475, 0.05329473316669464, 0.11406904458999634, -0.2287701666355133, -0.04336905851960182, -0.20341816544532776, 0.10597343742847443, 0.3737866282463074, 0.06144373118877411, 0.2606566250324249, 0.48028498888015747, 0.33708706498146057, -0.6553972959518433, 0.013770297169685364, -0.2892290949821472, -0.025394435971975327, -0.18594303727149963, -0.673516571521759, -0.024639636278152466, 0.4803009033203125, 0.2753613591194153, 0.18224447965621948, 0.29827597737312317, -0.19190770387649536, 0.5343493819236755, -0.008796986192464828, 0.30696555972099304, -0.2181081920862198, -0.10381292551755905, -0.13984766602516174, -0.1418972909450531, -0.15162532031536102, 0.06165549159049988, -0.3548746109008789, 0.31456825137138367, 0.10652821511030197, -0.23747017979621887, 0.18194086849689484, 0.539375901222229, 0.29171907901763916, 0.5020653009414673, -0.012960664927959442, -0.3500745892524719, 0.19516970217227936, 0.0072794631123542786, 0.009369991719722748, 0.4073788821697235, 0.2548157274723053, -0.10485469549894333, 1.1859781742095947, 0.282864511013031, 0.1649128943681717, -0.42073702812194824, -0.25075018405914307, -0.044629622250795364, -0.1547919064760208, -0.15878251194953918, -0.4995075762271881, -0.40129488706588745, -0.2861984372138977, -0.11829471588134766, -0.372819185256958, 0.17397379875183105, -0.4876343607902527, -0.056973375380039215, 0.002628237009048462, 0.06443263590335846, 0.21376870572566986, -0.3212122917175293, 0.8684196472167969, 0.7110564708709717, -0.26821866631507874, 0.1602502316236496, -0.8188390731811523, -0.31392621994018555, 0.21196937561035156, 0.2769344449043274, 0.10476375371217728, 0.08666462451219559, -0.05256888270378113, 0.019267655909061432, -0.21478864550590515, -0.21563377976417542, 0.014068901538848877, -0.4271088242530823, 0.2956088185310364, 0.2767024636268616, 0.12713822722434998, 0.10711778700351715, -0.31534022092819214, -0.7022720575332642, 0.04825911670923233, -0.2894078493118286, 0.15990173816680908, 0.28615596890449524, 0.3513187766075134, -0.06886336207389832, 0.6245967149734497, -0.2923956513404846, 0.026951052248477936, 0.11437344551086426, -0.009004596620798111, 0.8237249255180359, -0.12523512542247772, 0.24733443558216095, 0.13817456364631653, -0.36867082118988037, -0.06106507033109665, -0.1252005249261856, -0.24754440784454346, 0.19010043144226074, -0.7401745319366455, 0.23279890418052673, 0.3937625288963318, -0.11006136238574982, -0.18028445541858673, -0.2995349168777466, 0.24263998866081238, -0.24337467551231384, -0.8143541812896729, -0.004466548562049866, 0.42374736070632935, 0.0628005787730217, 0.26167750358581543, 1.0482228994369507, -0.38639727234840393, 0.41884884238243103, -0.02908148616552353, 0.18979047238826752, 0.24139204621315002, 0.1323426514863968, -0.3460729420185089, 0.05999540165066719, 0.6545634269714355, -0.02143344283103943, -0.35303887724876404, 0.42895081639289856, 0.009781084954738617, 0.09419995546340942, -1.0371383428573608, 0.11786141991615295, -0.6087008714675903, 0.6397130489349365, 0.04840565472841263, -0.1515766680240631, -0.6389090418815613, 0.11881940066814423, 0.07600889354944229, 0.2536744773387909, -0.6087324023246765, -0.13956038653850555, -0.044844165444374084, 0.47376748919487, -0.5100854635238647, -0.5746822953224182, -0.4710806608200073, -0.0860837996006012, -2.354755997657776e-05, 0.11844732612371445, -0.1641201376914978, 0.5381179451942444, -0.39511072635650635, -0.10469445586204529, -1.0518221855163574, 0.11303244531154633, -0.15133944153785706, -0.07616110146045685, -0.18919262290000916, -0.023491807281970978, 0.4380597472190857, -0.3335927128791809, -0.1517273634672165, 0.2205161303281784, -0.5247754454612732, -0.433458536863327, 0.6387712359428406, 0.23654891550540924, -0.42217734456062317, 0.0060136038810014725, -0.20688077807426453, 0.4139856696128845, -0.34582120180130005, 0.16556726396083832, 0.3267609179019928, 0.6934036016464233, 0.015689723193645477, -0.2867664694786072, -0.1570829302072525, -0.13936114311218262, -0.5055323243141174, -0.34290793538093567, 0.2088516652584076, 0.16583292186260223, -0.2089342325925827, 0.0047185979783535, 0.14146411418914795, -0.2377070188522339, 0.3338395953178406, 0.1297343373298645, -0.28573867678642273, -0.2476518750190735, 0.42548227310180664, -0.22828689217567444, -0.2209286093711853, 0.2092519849538803, 0.13094036281108856, -0.050192587077617645, -0.013185873627662659, 0.20819984376430511, -0.590785026550293, 0.3047303259372711, -0.04172978922724724, 0.07513344287872314, -0.49397721886634827, 0.394557923078537, 0.027982164174318314, -0.03234529495239258, -0.4598943889141083, 0.12571129202842712, -0.10395786166191101, -0.7523690462112427, -0.21520176529884338, 0.3537064790725708, -0.1314971148967743, -0.40234822034835815, -0.19095030426979065, 0.2241930365562439, -0.030423402786254883, 0.24737724661827087, -0.18795877695083618, -0.06761309504508972, -0.1787668764591217, -0.24395930767059326, -0.042820610105991364, -0.36860090494155884, 0.2959412932395935, 0.06608089804649353, 0.38329482078552246, -0.029863718897104263, -0.22199447453022003, -0.14925692975521088, 0.17321136593818665, -0.2907690107822418, 0.23555323481559753, 0.7190688252449036, -0.5451159477233887, -0.2757304906845093, 0.13347133994102478, -0.20016296207904816, -0.16077686846256256, -0.06611384451389313, -0.18512988090515137, -0.5091965794563293, 0.09560314565896988, 0.21853826940059662, 0.2611772119998932, -0.00016783177852630615, 0.011892130598425865, 0.5450518131256104, 0.46314629912376404, -0.3741258680820465, -0.35845592617988586, -0.0242062509059906, -0.3381757140159607, -0.31798040866851807, 0.3382136821746826, -0.3494700491428375, -0.18475615978240967, -0.010816290974617004, -0.11279979348182678, 0.16977845132350922, -0.01769975572824478, 0.5151262283325195, 0.3195383846759796, -0.6560266017913818, -0.3686582148075104, 0.14153507351875305]'" ] }, "execution_count": 6,