-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathali_twitter_final_read_data.json
1 lines (1 loc) · 12.9 KB
/
ali_twitter_final_read_data.json
1
{"paragraphs":[{"title":"Import Libraries","text":"%pyspark\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport pandas as pd\n\nfrom pyspark.sql import SQLContext\nfrom pyspark.sql import functions as F\nfrom pyspark.sql import types as T\nfrom pyspark.sql import Row\nfrom pyspark.sql.window import Window\n\nfrom transformers import BertTokenizer\ntokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)\n\n@F.udf(\"String\")\ndef decode_tokens(tokens):\n return tokenizer.decode(tokens)\n\nsqc = SQLContext(sc)","user":"anonymous","dateUpdated":"2020-07-03T14:25:17+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034018844_1691767909","id":"20200613-074018_765922500","dateCreated":"2020-06-13T07:40:18+0000","dateStarted":"2020-07-03T14:25:17+0000","dateFinished":"2020-07-03T14:25:18+0000","status":"FINISHED","progressUpdateIntervalMs":500,"focus":true,"$$hashKey":"object:7142"},{"title":"Schema","text":"%pyspark\nfields = list()\nfields.append(T.StructField(\"text_tokens\", T.StringType(), True))\nfields.append(T.StructField(\"hashtags\", T.StringType(), True))\nfields.append(T.StructField(\"tweet_id\", T.StringType(), True))\nfields.append(T.StructField(\"present_media\", T.StringType(), True))\nfields.append(T.StructField(\"present_links\", T.StringType(), True))\nfields.append(T.StructField(\"present_domains\", T.StringType(), True))\nfields.append(T.StructField(\"tweet_type\", T.StringType(), True))\nfields.append(T.StructField(\"language\", T.StringType(), True))\nfields.append(T.StructField(\"timestamp\", T.LongType(), True))\n\nfields.append(T.StructField(\"engager_user_id\", T.StringType(), 
True))\nfields.append(T.StructField(\"engager_follower_count\", T.LongType(), True))\nfields.append(T.StructField(\"engager_following_count\", T.LongType(), True))\nfields.append(T.StructField(\"engager_is_verified\", T.BooleanType(), True))\nfields.append(T.StructField(\"engager_account_creation_time\", T.LongType(), True))\n\nfields.append(T.StructField(\"engagee_user_id\", T.StringType(), True))\nfields.append(T.StructField(\"engagee_follower_count\", T.LongType(), True))\nfields.append(T.StructField(\"engagee_following_count\", T.LongType(), True))\nfields.append(T.StructField(\"engagee_is_verified\", T.BooleanType(), True))\nfields.append(T.StructField(\"engagee_account_creation_time\", T.LongType(), True))\n\nfields.append(T.StructField(\"engagee_follows_engager\", T.BooleanType(), True))\nfields.append(T.StructField(\"reply_engagement_timestamp\", T.LongType(), True))\nfields.append(T.StructField(\"retweet_engagement_timestamp\", T.LongType(), True))\nfields.append(T.StructField(\"retweet_with_comment_engagement_timestamp\", T.LongType(), True))\nfields.append(T.StructField(\"like_engagement_timestamp\", T.LongType(), True))\nschema = T.StructType(fields)","user":"anonymous","dateUpdated":"2020-06-13T07:54:19+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034031231_432091103","id":"20200613-074031_267186397","dateCreated":"2020-06-13T07:40:31+0000","dateStarted":"2020-06-13T07:54:19+0000","dateFinished":"2020-06-13T07:54:19+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7143"},{"title":"File Paths","text":"%pyspark\ntraining_file_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/training.tsv\"\nvalidation_file_path = 
\"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/val.tsv\"\ntest_file_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/competition_test.tsv\"","user":"anonymous","dateUpdated":"2020-07-03T14:25:20+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034299071_47189920","id":"20200613-074459_1883007330","dateCreated":"2020-06-13T07:44:59+0000","dateStarted":"2020-07-03T14:25:20+0000","dateFinished":"2020-07-03T14:25:20+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7144"},{"title":"Reading training data","text":"%pyspark\ntraining_df = sqc.read.option(\"sep\", chr(1)).schema(schema).csv(training_file_path)\ntraining_df = training_df.withColumn(\"text_tokens\", F.split(\"text_tokens\", \"\\t\"))\ntraining_df = training_df.withColumn(\"hashtags\", F.split(\"hashtags\", \"\\t\"))\ntraining_df = training_df.withColumn(\"present_media\", F.split(\"present_media\", \"\\t\"))\ntraining_df = training_df.withColumn(\"present_links\", F.split(\"present_links\", \"\\t\"))\ntraining_df = training_df.withColumn(\"present_domains\", F.split(\"present_domains\", \"\\t\"))\n# training_df.printSchema()\n# 
training_df.show()","user":"anonymous","dateUpdated":"2020-06-13T07:54:25+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034210560_-1274818469","id":"20200613-074330_469957860","dateCreated":"2020-06-13T07:43:30+0000","dateStarted":"2020-06-13T07:54:25+0000","dateFinished":"2020-06-13T07:54:25+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7145"},{"title":"Read validation data","text":"%pyspark\nvalidation_df = sqc.read.option(\"sep\", chr(1)).schema(schema).csv(validation_file_path)\nvalidation_df = validation_df.withColumn(\"text_tokens\", F.split(\"text_tokens\", \"\\t\"))\nvalidation_df = validation_df.withColumn(\"hashtags\", F.split(\"hashtags\", \"\\t\"))\nvalidation_df = validation_df.withColumn(\"present_media\", F.split(\"present_media\", \"\\t\"))\nvalidation_df = validation_df.withColumn(\"present_links\", F.split(\"present_links\", \"\\t\"))\nvalidation_df = validation_df.withColumn(\"present_domains\", F.split(\"present_domains\", \"\\t\"))\n\n# validation_df.show()","user":"anonymous","dateUpdated":"2020-06-13T07:54:27+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034211239_-16968555","id":"20200613-074331_1889803118","dateCreated":"2020-06-13T07:43:31+0000","dateStarted":"2020-06-13T07:54:27+0000","dateFinished":"2020-06-13T07:54:27+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7146"},{"title":"Read test 
data","text":"%pyspark\ntest_df = sqc.read.option(\"sep\", chr(1)).schema(schema).csv(test_file_path)\ntest_df = test_df.withColumn(\"text_tokens\", F.split(\"text_tokens\", \"\\t\"))\ntest_df = test_df.withColumn(\"hashtags\", F.split(\"hashtags\", \"\\t\"))\ntest_df = test_df.withColumn(\"present_media\", F.split(\"present_media\", \"\\t\"))\ntest_df = test_df.withColumn(\"present_links\", F.split(\"present_links\", \"\\t\"))\ntest_df = test_df.withColumn(\"present_domains\", F.split(\"present_domains\", \"\\t\"))\n# test_df.printSchema()","user":"anonymous","dateUpdated":"2020-06-13T07:54:53+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034211503_2104871322","id":"20200613-074331_2090659351","dateCreated":"2020-06-13T07:43:31+0000","dateStarted":"2020-06-13T07:54:53+0000","dateFinished":"2020-06-13T07:54:53+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7147"},{"title":"Parquet file path","text":"%pyspark\ntraining_parquet_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/training_df\"\nvalidation_parquet_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/val_df\"\ntest_parquet_path = 
\"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/test_df\"","user":"anonymous","dateUpdated":"2020-06-13T07:54:57+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034528822_2026900320","id":"20200613-074848_1848279721","dateCreated":"2020-06-13T07:48:48+0000","dateStarted":"2020-06-13T07:54:57+0000","dateFinished":"2020-06-13T07:54:57+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7148"},{"title":"save data as parquet","text":"%pyspark\ntraining_df.write.parquet(training_parquet_path)\nvalidation_df.write.parquet(validation_parquet_path)\ntest_df.write.parquet(test_parquet_path)","user":"anonymous","dateUpdated":"2020-06-13T07:56:39+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034211890_804401462","id":"20200613-074331_1101289609","dateCreated":"2020-06-13T07:43:31+0000","dateStarted":"2020-06-13T07:56:39+0000","dateFinished":"2020-06-13T08:17:40+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7149"},{"title":"Read dataframes","text":"%pyspark\ntraining_parquet_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/training_df\"\nvalidation_parquet_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/val_df\"\ntest_parquet_path = \"/nas_ssd_social_media_analytics/ali_twitter/final_dataset_12062020/temp/test_df\"\n\ntraining_df = 
sqc.read.parquet(training_parquet_path)\nvalidation_df = sqc.read.parquet(validation_parquet_path)\ntest_df = sqc.read.parquet(test_parquet_path)","user":"anonymous","dateUpdated":"2020-07-03T14:25:32+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python","title":true},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[]},"apps":[],"jobName":"paragraph_1592034980465_154632120","id":"20200613-075620_2080256236","dateCreated":"2020-06-13T07:56:20+0000","dateStarted":"2020-07-03T14:25:32+0000","dateFinished":"2020-07-03T14:25:34+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7150"},{"text":"%pyspark\nprint(training_df.count())\nprint(validation_df.count())\nprint(test_df.count())","user":"anonymous","dateUpdated":"2020-07-03T14:25:39+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python"},"settings":{"params":{},"forms":{}},"results":{"code":"SUCCESS","msg":[{"type":"TEXT","data":"121386431\n12434735\n12434838\n"}]},"apps":[],"jobName":"paragraph_1592036386459_-1619460297","id":"20200613-081946_349610833","dateCreated":"2020-06-13T08:19:46+0000","dateStarted":"2020-07-03T14:25:39+0000","dateFinished":"2020-07-03T14:25:42+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:7151"},{"text":"%pyspark\n","user":"anonymous","dateUpdated":"2020-07-03T14:26:05+0000","config":{"colWidth":12,"fontSize":9,"enabled":true,"results":{},"editorSetting":{"language":"python","editOnDblClick":false,"completionKey":"TAB","completionSupport":true},"editorMode":"ace/mode/python"},"settings":{"params":{},"forms":{}},"apps":[],"jobName":"paragraph_1593786365205_-892316126","id":"20200703-142605_865930413","dateCre
ated":"2020-07-03T14:26:05+0000","status":"READY","progressUpdateIntervalMs":500,"$$hashKey":"object:7152"}],"name":"ali/twitter/final/read_data","id":"2FBVBHWNW","noteParams":{},"noteForms":{},"angularObjects":{"md:shared_process":[],"sh:shared_process":[],"spark:shared_process":[]},"config":{"isZeppelinNotebookCronEnable":false,"looknfeel":"default","personalizedMode":"false"},"info":{}}