master

rese1f · Oct 30, 2023 · 6537a3f · 6537a3f
1 parent 2b537af
commit 6537a3f
Show file tree

Hide file tree

Showing 78 changed files with 19,552 additions and 41 deletions.
diff --git a/.gitignore b/.gitignore
@@ -29,3 +29,7 @@ MANIFEST
 data/
 ckpt/
 *.sh
+output/
+scripts/
+dataset/
+src/
diff --git a/MovieChat/models/moviechat.py b/MovieChat/models/moviechat.py
@@ -77,7 +77,7 @@ def __init__(
         fusion_head_layers = 2,
         num_video_query_token = 32,
         short_memory_length = 18,
-        long_memory_length = 64,
+        long_memory_length = 256,
         short_memory_merge = 2,
         Qformer_input = 8
     ):
@@ -292,7 +292,6 @@ def encode_short_memory_frame(self, videofragment, n_frame:int = 16):
                     similar_list.append(frame_silimar)
 
             for frame in self.short_memory_buffer:
-
                 self.long_memory_buffer.append(frame)
 
     def encode_long_video(self, cur_image, middle_video:False):
@@ -303,7 +302,7 @@ def encode_long_video(self, cur_image, middle_video:False):
         self.long_memory_buffer = [i.unsqueeze(0) for i in self.long_memory_buffer]
 
         # expand position embedding
-        n_position = 8
+        n_position = 16
         position_ids = torch.arange(n_position).long().to(self.query_tokens.device)
         position_ids = position_ids.unsqueeze(0).expand(batch_size, -1) 
         p = self.video_frame_position_embedding(position_ids).squeeze(0)
@@ -326,22 +325,24 @@ def encode_long_video(self, cur_image, middle_video:False):
         frame_position_embeddings = torch.cat(frame_position_embeddings, dim = 0)
 
         if middle_video:
-            cur_long_length = len(self.long_memory_buffer)
-            cur_short_length = len(self.temp_short_memory)
-
-            while (cur_long_length+cur_short_length+1) > self.max_frame_pos:
-                self.temp_short_memory.pop(0)
+            while (len(self.long_memory_buffer)+len(self.temp_short_memory)+1) > frame_position_embeddings.shape[0]:
+                if len(self.temp_short_memory) != 0:
+                    self.temp_short_memory.pop(0)
+                else:
+                    self.long_memory_buffer.pop(0)
 
             if len(self.long_memory_buffer) == 0:
                 self.temp_short_memory = [i.unsqueeze(0) for i in self.temp_short_memory]
                 cur_short = torch.cat(self.temp_short_memory, dim = 0)
-                video_features = torch.cat([video_features, cur_image], dim = 0)
+                video_features = torch.cat([cur_short], dim = 0)
             else:
                 cur_video = torch.cat(self.long_memory_buffer,dim = 0)
                 self.temp_short_memory = [i.unsqueeze(0) for i in self.temp_short_memory]
-                cur_short = torch.cat(self.temp_short_memory, dim = 0)
-
-                video_features = torch.cat([cur_video,cur_short], dim = 0)
+                if len(self.temp_short_memory) != 0:
+                    cur_short = torch.cat(self.temp_short_memory, dim = 0)
+                    video_features = torch.cat([cur_video,cur_short], dim = 0)
+                else:
+                    video_features = torch.cat([cur_video], dim = 0)
                 video_features = torch.cat([video_features, cur_image], dim = 0)
 
             cur_video = []
@@ -478,7 +479,6 @@ def encode_videoQformer_visual(self, image):
                 for i in self.long_memory_buffer:
                     while len(i.shape) > 3:
                         i = i.squeeze(0)
-                import pdb;pdb.set_trace()
                 frame_hidden_state = torch.cat(self.long_memory_buffer,dim = 0)
                 position_ids = torch.arange(self.long_memory_length, dtype=torch.long, device=query_tokens.device) 
                 position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
@@ -525,7 +525,6 @@ def prompt_wrap(self, img_embeds, atts_img, prompt):
             return img_embeds, atts_img
 
     def forward(self, samples):
-        import pdb;pdb.set_trace()
         if 'conv_type' in samples.keys() and samples['conv_type']=='multi':
             im_patch_token_id = self.IMAGE_PATCH_TOKEN_ID
             image = samples["images"]

diff --git a/__MACOSX/Video- LLAMa/._breakpoint.json b/__MACOSX/Video- LLAMa/._breakpoint.json
diff --git a/__MACOSX/Video- LLAMa/._global.json b/__MACOSX/Video- LLAMa/._global.json
diff --git a/__MACOSX/Video-Chat/._breakpoint.json b/__MACOSX/Video-Chat/._breakpoint.json
diff --git a/__MACOSX/Video-Chat/._global.json b/__MACOSX/Video-Chat/._global.json
diff --git a/__MACOSX/Video-Chatgpt/._breakpoint.json b/__MACOSX/Video-Chatgpt/._breakpoint.json
diff --git a/__MACOSX/Video-Chatgpt/._global.json b/__MACOSX/Video-Chatgpt/._global.json
diff --git a/baseline_result_short/activitynet-qa.json b/baseline_result_short/activitynet-qa.json
diff --git a/baseline_result_short/cal_varies.py b/baseline_result_short/cal_varies.py
diff --git a/baseline_result_short/generic_qa.json b/baseline_result_short/generic_qa.json
diff --git a/baseline_result_short/temporal_qa.json b/baseline_result_short/temporal_qa.json
diff --git a/baseline_results_moviechat1k/new_videochat_breakpoint.json b/baseline_results_moviechat1k/new_videochat_breakpoint.json
diff --git a/baseline_results_moviechat1k/new_videochat_breakpoint.txt b/baseline_results_moviechat1k/new_videochat_breakpoint.txt
diff --git a/baseline_results_moviechat1k/new_videochat_global.json b/baseline_results_moviechat1k/new_videochat_global.json
diff --git a/baseline_results_moviechat1k/new_videochat_global.txt b/baseline_results_moviechat1k/new_videochat_global.txt
diff --git a/baseline_results_moviechat1k/new_videochatgpt_breakpoint.json b/baseline_results_moviechat1k/new_videochatgpt_breakpoint.json
diff --git a/baseline_results_moviechat1k/new_videochatgpt_breakpoint.txt b/baseline_results_moviechat1k/new_videochatgpt_breakpoint.txt
diff --git a/baseline_results_moviechat1k/new_videochatgpt_global.json b/baseline_results_moviechat1k/new_videochatgpt_global.json
diff --git a/baseline_results_moviechat1k/new_videochatgpt_global.txt b/baseline_results_moviechat1k/new_videochatgpt_global.txt
diff --git a/baseline_results_moviechat1k/new_videollama_breakpoint.json b/baseline_results_moviechat1k/new_videollama_breakpoint.json
diff --git a/baseline_results_moviechat1k/new_videollama_breakpoint.txt b/baseline_results_moviechat1k/new_videollama_breakpoint.txt
diff --git a/baseline_results_moviechat1k/new_videollama_global.json b/baseline_results_moviechat1k/new_videollama_global.json
diff --git a/baseline_results_moviechat1k/new_videollama_global.txt b/baseline_results_moviechat1k/new_videollama_global.txt
@@ -0,0 +1,94 @@
+{"question": "When does the things in the video happens, decades ago, now, or in the future?", "answer": "It happens decades ago, most likely during a world war.", "pred": "Scenes in past, future, and ambiguous time periods."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various ages shown from ancient to future on beach."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Man and woman in different time periods."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?(Does the technology exists on earth?)", "answer": "It happens at future.", "pred": "Unclear time period, possible future technology."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "No specific time period shown in video."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Old video of African wildlife."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Various scenes depict people, animals, and activities."}
+{"question": "Is there more than five different characters appear(include the blue human-like creature)?", "answer": "Yes.", "pred": "Yes, many different characters are present."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, video shows coral reef and fish at night."}
+{"question": "Is there more than five different characters appear(not including people in the background)?", "answer": "No.", "pred": "Yes, multiple characters with varying presence."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Person climbs glaciers, mountains, ocean, landscapes shown."}
+{"question": "Is there more than five different people appear?", "answer": "Yes.", "pred": "Yes, more than five people appear but not always visible."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Video shot in modern natural locations with waterfall, beach, rocks, cliff."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Different time periods depicted in video with appropriate descriptions."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Man fishing in ocean with surfboard and boat at sunset."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Various natural locations and objects from different time periods."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No specific location mentioned, various outdoor environments."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern era setting."}
+{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Multiple characters with different outfits in video."}
+{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "At night, with moon and stars visible."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Mixed ancient and modern scenery in Sanya, China."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various people in different situations, no specific time period."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various settings, costumes, and wine scenes depicted."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Video shows people driving, flying, and using various vehicles."}
+{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, different characters but uncertain number."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Time travel through ancient times with various cultural scenes."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No city-specific mention, but buildings visible in background."}
+{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Yes, at night."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Current video in water environment."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Ancient and modern scenes featuring different characters and settings."}
+{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Not specified, day or night."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Various locations shown, no cities mentioned."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern video with possible ancient setting."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various settings, uncertain time."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No cities, just underwater with sea life."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Ocean or large body of water."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, underwater world in Hangzhou, China."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "No specific information about time or location."}
+{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters appear at different times."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various historical and futuristic settings depicted in the video."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Time period is ambiguous due to mixed descriptions."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Present time video showcases old desert town buildings."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Events in video have unclear time frame."}
+{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Video depicts events in different time periods."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at future.", "pred": "Video shows mix of past, present, and future concepts."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age, maybe one decade ago.", "pred": "Unclear era, variously dressed people, blonde wig, ancient man."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various people in different locations looking at each other."}
+{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, various characters appear in the video."}
+{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Various activities and events happening in different settings."}
+{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters including young boys, women, and men."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Different scenes in desert at night with moon."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Present events depict past or future scenarios."}
+{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Shows day/night scenes, building interior, dark room at night."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, various scenes of city, volcano, ocean captured."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern.", "pred": "Various scenes with people in different environments."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Time periods not specifically mentioned in video."}
+{"question": "Is there more than ten different characters appear?", "answer": "No.", "pred": "Few characters, mostly men and women in different shots."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, in cities."}
+{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Daytime scenes transition to shadows and darkness."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Video shows scenes of people in different time periods."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern time setting."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video featuring ancient and modern scenes of various locations."}
+{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Current time is mentioned due to tourist activity."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Undetermined time travel through various eras."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Various nighttime activities in cities captured in video."}
+{"question": "Is there more than five different characters appear(include the blue human-like creature)?", "answer": "Yes.", "pred": "Different characters appear in various scenes with weapons."}
+{"question": "When does the things in the video happens, decades ago, now, or in the future?", "answer": "It happens decades ago, most likely during a world war.", "pred": "Video shows American soldiers in WWII fighting the Japanese army."}
+{"question": "Is there more than three different characters appear(not including people in the background)?", "answer": "Yes.", "pred": "Various characters appear throughout the video."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Video depicts various scenes of people with horses through time."}
+{"question": "How many man and woman appears in the video?", "answer": "No human appear in the video.", "pred": "Gender distribution in video unknown."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Futuristic scenes with people in hot tubs and tents."}
+{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Events occur in the present time."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, footage shows ocean and sea in cities."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No city-specific information mentioned in the video."}
+{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, different characters in some descriptions."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various scenes of people on horses in different places."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Time period in video uncertain, possibly past, future, and different locations."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Unclear era; possible past or future scenario."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at future.", "pred": "Various time periods and futuristic settings depicted in video scenes."}
+{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Night and daytime scenes are included."}
+{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No, urban areas are not visible."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Can't tell. It maybe happen at future age.", "pred": "Scenes vary with different time periods and settings."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video shows different ages and concept of time."}
+{"question": "Is there more than three different characters appear(not including people in the background)?", "answer": "No.", "pred": "Yes, many different characters appear in the video."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Multiple historical periods depicted in the video."}
+{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Past and present events depicted in real-time."}
+{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Video events occur in modern times."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various landscapes and eras shown in National Parks video."}
+{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Daytime video."}
+{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Uncertain, but possibly at night."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video covers various eras: ancient, modern, and future."}
+{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters with different appearance and behavior."}
+{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "People exploring ancient city with space references."}
+{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Black and white video with people and car."}
diff --git a/baseline_results_moviechat1k/videochat_breakpoint.json b/baseline_results_moviechat1k/videochat_breakpoint.json
-Original file line number
+Diff line change
@@ Expand Up / @@ -29,3 +29,7 @@ MANIFEST @@
     data/
     ckpt/
     *.sh
+    output/
+    scripts/
+    dataset/
+    src/