Skip to content

Commit

Permalink
master
Browse files Browse the repository at this point in the history
  • Loading branch information
rese1f committed Oct 30, 2023
1 parent 2b537af commit 6537a3f
Show file tree
Hide file tree
Showing 78 changed files with 19,552 additions and 41 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ MANIFEST
data/
ckpt/
*.sh
output/
scripts/
dataset/
src/
27 changes: 13 additions & 14 deletions MovieChat/models/moviechat.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def __init__(
fusion_head_layers = 2,
num_video_query_token = 32,
short_memory_length = 18,
long_memory_length = 64,
long_memory_length = 256,
short_memory_merge = 2,
Qformer_input = 8
):
Expand Down Expand Up @@ -292,7 +292,6 @@ def encode_short_memory_frame(self, videofragment, n_frame:int = 16):
similar_list.append(frame_silimar)

for frame in self.short_memory_buffer:

self.long_memory_buffer.append(frame)

def encode_long_video(self, cur_image, middle_video:False):
Expand All @@ -303,7 +302,7 @@ def encode_long_video(self, cur_image, middle_video:False):
self.long_memory_buffer = [i.unsqueeze(0) for i in self.long_memory_buffer]

# expand position embedding
n_position = 8
n_position = 16
position_ids = torch.arange(n_position).long().to(self.query_tokens.device)
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
p = self.video_frame_position_embedding(position_ids).squeeze(0)
Expand All @@ -326,22 +325,24 @@ def encode_long_video(self, cur_image, middle_video:False):
frame_position_embeddings = torch.cat(frame_position_embeddings, dim = 0)

if middle_video:
cur_long_length = len(self.long_memory_buffer)
cur_short_length = len(self.temp_short_memory)

while (cur_long_length+cur_short_length+1) > self.max_frame_pos:
self.temp_short_memory.pop(0)
while (len(self.long_memory_buffer)+len(self.temp_short_memory)+1) > frame_position_embeddings.shape[0]:
if len(self.temp_short_memory) != 0:
self.temp_short_memory.pop(0)
else:
self.long_memory_buffer.pop(0)

if len(self.long_memory_buffer) == 0:
self.temp_short_memory = [i.unsqueeze(0) for i in self.temp_short_memory]
cur_short = torch.cat(self.temp_short_memory, dim = 0)
video_features = torch.cat([video_features, cur_image], dim = 0)
video_features = torch.cat([cur_short], dim = 0)
else:
cur_video = torch.cat(self.long_memory_buffer,dim = 0)
self.temp_short_memory = [i.unsqueeze(0) for i in self.temp_short_memory]
cur_short = torch.cat(self.temp_short_memory, dim = 0)

video_features = torch.cat([cur_video,cur_short], dim = 0)
if len(self.temp_short_memory) != 0:
cur_short = torch.cat(self.temp_short_memory, dim = 0)
video_features = torch.cat([cur_video,cur_short], dim = 0)
else:
video_features = torch.cat([cur_video], dim = 0)
video_features = torch.cat([video_features, cur_image], dim = 0)

cur_video = []
Expand Down Expand Up @@ -478,7 +479,6 @@ def encode_videoQformer_visual(self, image):
for i in self.long_memory_buffer:
while len(i.shape) > 3:
i = i.squeeze(0)
import pdb;pdb.set_trace()
frame_hidden_state = torch.cat(self.long_memory_buffer,dim = 0)
position_ids = torch.arange(self.long_memory_length, dtype=torch.long, device=query_tokens.device)
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
Expand Down Expand Up @@ -525,7 +525,6 @@ def prompt_wrap(self, img_embeds, atts_img, prompt):
return img_embeds, atts_img

def forward(self, samples):
import pdb;pdb.set_trace()
if 'conv_type' in samples.keys() and samples['conv_type']=='multi':
im_patch_token_id = self.IMAGE_PATCH_TOKEN_ID
image = samples["images"]
Expand Down
Binary file added __MACOSX/Video- LLAMa/._breakpoint.json
Binary file not shown.
Binary file added __MACOSX/Video- LLAMa/._global.json
Binary file not shown.
Binary file added __MACOSX/Video-Chat/._breakpoint.json
Binary file not shown.
Binary file added __MACOSX/Video-Chat/._global.json
Binary file not shown.
Binary file added __MACOSX/Video-Chatgpt/._breakpoint.json
Binary file not shown.
Binary file added __MACOSX/Video-Chatgpt/._global.json
Binary file not shown.
1 change: 1 addition & 0 deletions baseline_result_short/activitynet-qa.json

Large diffs are not rendered by default.

Empty file.
1 change: 1 addition & 0 deletions baseline_result_short/generic_qa.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions baseline_result_short/temporal_qa.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions baseline_results_moviechat1k/new_videochat_breakpoint.json

Large diffs are not rendered by default.

981 changes: 981 additions & 0 deletions baseline_results_moviechat1k/new_videochat_breakpoint.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions baseline_results_moviechat1k/new_videochat_global.json

Large diffs are not rendered by default.

279 changes: 279 additions & 0 deletions baseline_results_moviechat1k/new_videochat_global.txt

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,005 changes: 1,005 additions & 0 deletions baseline_results_moviechat1k/new_videochatgpt_breakpoint.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions baseline_results_moviechat1k/new_videochatgpt_global.json

Large diffs are not rendered by default.

294 changes: 294 additions & 0 deletions baseline_results_moviechat1k/new_videochatgpt_global.txt

Large diffs are not rendered by default.

Large diffs are not rendered by default.

980 changes: 980 additions & 0 deletions baseline_results_moviechat1k/new_videollama_breakpoint.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions baseline_results_moviechat1k/new_videollama_global.json

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions baseline_results_moviechat1k/new_videollama_global.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{"question": "When does the things in the video happens, decades ago, now, or in the future?", "answer": "It happens decades ago, most likely during a world war.", "pred": "Scenes in past, future, and ambiguous time periods."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various ages shown from ancient to future on beach."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Man and woman in different time periods."}
{"question": "When does the things in the video happens, ancient age, modern age or future?(Does the technology exists on earth?)", "answer": "It happens at future.", "pred": "Unclear time period, possible future technology."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "No specific time period shown in video."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Old video of African wildlife."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Various scenes depict people, animals, and activities."}
{"question": "Is there more than five different characters appear(include the blue human-like creature)?", "answer": "Yes.", "pred": "Yes, many different characters are present."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, video shows coral reef and fish at night."}
{"question": "Is there more than five different characters appear(not including people in the background)?", "answer": "No.", "pred": "Yes, multiple characters with varying presence."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Person climbs glaciers, mountains, ocean, landscapes shown."}
{"question": "Is there more than five different people appear?", "answer": "Yes.", "pred": "Yes, more than five people appear but not always visible."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Video shot in modern natural locations with waterfall, beach, rocks, cliff."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Different time periods depicted in video with appropriate descriptions."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Man fishing in ocean with surfboard and boat at sunset."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Various natural locations and objects from different time periods."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No specific location mentioned, various outdoor environments."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern era setting."}
{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Multiple characters with different outfits in video."}
{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "At night, with moon and stars visible."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Mixed ancient and modern scenery in Sanya, China."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various people in different situations, no specific time period."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various settings, costumes, and wine scenes depicted."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Video shows people driving, flying, and using various vehicles."}
{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, different characters but uncertain number."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Time travel through ancient times with various cultural scenes."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No city-specific mention, but buildings visible in background."}
{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Yes, at night."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Current video in water environment."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Ancient and modern scenes featuring different characters and settings."}
{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Not specified, day or night."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Various locations shown, no cities mentioned."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern video with possible ancient setting."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various settings, uncertain time."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No cities, just underwater with sea life."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Ocean or large body of water."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, underwater world in Hangzhou, China."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "No specific information about time or location."}
{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters appear at different times."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various historical and futuristic settings depicted in the video."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Time period is ambiguous due to mixed descriptions."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Present time video showcases old desert town buildings."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Events in video have unclear time frame."}
{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Video depicts events in different time periods."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at future.", "pred": "Video shows mix of past, present, and future concepts."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age, maybe one decade ago.", "pred": "Unclear era, variously dressed people, blonde wig, ancient man."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various people in different locations looking at each other."}
{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, various characters appear in the video."}
{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Various activities and events happening in different settings."}
{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters including young boys, women, and men."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Different scenes in desert at night with moon."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Present events depict past or future scenarios."}
{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Shows day/night scenes, building interior, dark room at night."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, various scenes of city, volcano, ocean captured."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern.", "pred": "Various scenes with people in different environments."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Time periods not specifically mentioned in video."}
{"question": "Is there more than ten different characters appear?", "answer": "No.", "pred": "Few characters, mostly men and women in different shots."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, in cities."}
{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Daytime scenes transition to shadows and darkness."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Video shows scenes of people in different time periods."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Modern time setting."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video featuring ancient and modern scenes of various locations."}
{"question": "When does the video take place? Nowadays or ancient times?", "answer": "It happens at nowadays.", "pred": "Current time is mentioned due to tourist activity."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Undetermined time travel through various eras."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Various nighttime activities in cities captured in video."}
{"question": "Is there more than five different characters appear(include the blue human-like creature)?", "answer": "Yes.", "pred": "Different characters appear in various scenes with weapons."}
{"question": "When does the things in the video happens, decades ago, now, or in the future?", "answer": "It happens decades ago, most likely during a world war.", "pred": "Video shows American soldiers in WWII fighting the Japanese army."}
{"question": "Is there more than three different characters appear(not including people in the background)?", "answer": "Yes.", "pred": "Various characters appear throughout the video."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Video depicts various scenes of people with horses through time."}
{"question": "How many man and woman appears in the video?", "answer": "No human appear in the video.", "pred": "Gender distribution in video unknown."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Futuristic scenes with people in hot tubs and tents."}
{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Events occur in the present time."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "Yes, footage shows ocean and sea in cities."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No city-specific information mentioned in the video."}
{"question": "Is there more than three different characters appear?", "answer": "Yes.", "pred": "Yes, different characters in some descriptions."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Various scenes of people on horses in different places."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Time period in video uncertain, possibly past, future, and different locations."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Unclear era; possible past or future scenario."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at future.", "pred": "Various time periods and futuristic settings depicted in video scenes."}
{"question": "Does it happen during day or night?", "answer": "Day and night.", "pred": "Night and daytime scenes are included."}
{"question": "Does it happen in the cities?", "answer": "No.", "pred": "No, urban areas are not visible."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Can't tell. It maybe happen at future age.", "pred": "Scenes vary with different time periods and settings."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video shows different ages and concept of time."}
{"question": "Is there more than three different characters appear(not including people in the background)?", "answer": "No.", "pred": "Yes, many different characters appear in the video."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Ancient age.", "pred": "Multiple historical periods depicted in the video."}
{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Past and present events depicted in real-time."}
{"question": "When does the things in the video happens, acient, modern, or in the future?", "answer": "It happens at modern time.", "pred": "Video events occur in modern times."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Various landscapes and eras shown in National Parks video."}
{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Daytime video."}
{"question": "Does it happen during day or night?", "answer": "Day.", "pred": "Uncertain, but possibly at night."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "Modern age.", "pred": "Video covers various eras: ancient, modern, and future."}
{"question": "Is there more than five different characters appear?", "answer": "Yes.", "pred": "Yes, multiple characters with different appearance and behavior."}
{"question": "When does it happen in the video, ancient age, modern age or future?", "answer": "Modern age.", "pred": "People exploring ancient city with space references."}
{"question": "When does the things in the video happens, ancient age, modern age or future?", "answer": "It happens at modern age.", "pred": "Black and white video with people and car."}
1 change: 1 addition & 0 deletions baseline_results_moviechat1k/videochat_breakpoint.json

Large diffs are not rendered by default.

Loading

0 comments on commit 6537a3f

Please sign in to comment.