Merge pull request #170 from stanford-crfm/jonathan/0428-weekly-assets

add new + notable assets

rishibommasani authored May 8, 2024
2 parents bc33b7e + d3c06a4 commit ebd9224

Showing 24 changed files with 567 additions and 10 deletions.
22 changes: 22 additions & 0 deletions assets/adobe.yaml
@@ -97,3 +97,25 @@
monthly_active_users: unknown
user_distribution: unknown
failures: unknown

- type: dataset
name: CulturaX
organization: University of Oregon, Adobe
description: CulturaX is a substantial multilingual dataset with 6.3 trillion tokens in 167 languages, tailored for LLM development.
created_date: 2023-09-17
url: https://arxiv.org/pdf/2309.09400
datasheet: https://huggingface.co/datasets/uonlp/CulturaX
modality: text
size: 6.3 trillion tokens
sample: []
analysis: none
dependencies: [mC4, OSCAR]
included: unknown
excluded: unknown
quality_control: unknown
access: open
license: mC4, OSCAR
intended_uses: ''
prohibited_uses: The data must not be used for malicious or harmful purposes against humanity.
monitoring: unknown
feedback: https://huggingface.co/datasets/uonlp/CulturaX/discussions
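
For reference, a minimal sketch of streaming the dataset with the Hugging Face `datasets` library; the per-language configuration name ("vi") and any gating requirements are assumptions, so check the datasheet linked above:

```python
from datasets import load_dataset

# Stream a single language configuration rather than downloading the full 6.3T-token corpus.
# "vi" is assumed to be a valid language configuration; the dataset may require
# accepting its terms of use on the Hugging Face Hub first.
culturax_vi = load_dataset("uonlp/CulturaX", "vi", split="train", streaming=True)

for example in culturax_vi.take(3):
    print(example["text"][:200])
```
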
22 changes: 22 additions & 0 deletions assets/ai2.yaml
@@ -255,3 +255,25 @@
prohibited_uses: ''
monitoring: unknown
feedback: https://huggingface.co/allenai/OLMo-7B/discussions

- type: dataset
name: MADLAD-400
organization: AI2
description: MADLAD-400 is a document-level multilingual dataset based on Common Crawl, covering 419 languages in total.
created_date: 2023-09-09
url: https://arxiv.org/abs/2309.04662
datasheet: https://huggingface.co/datasets/allenai/MADLAD-400
modality: text
size: 3 trillion tokens
sample: []
analysis: none
dependencies: [Common Crawl]
included: ''
excluded: ''
quality_control: ''
access: open
license: CC BY 4.0
intended_uses: ''
prohibited_uses: ''
monitoring: unknown
feedback: https://huggingface.co/datasets/allenai/MADLAD-400/discussions
26 changes: 25 additions & 1 deletion assets/alibaba.yaml
@@ -87,7 +87,7 @@

- type: model
name: Qwen 1.5
organization: Qwen Team
organization: Alibaba
description: Qwen 1.5 is the next iteration in their Qwen series, consisting of
Transformer-based large language models pretrained on a large volume of data,
including web texts, books, codes, etc.
@@ -141,3 +141,27 @@
prohibited_uses: ''
monitoring: unknown
feedback: https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B/discussions

- type: model
name: SeaLLM v2.5
organization: DAMO Academy, Alibaba
description: SeaLLM v2.5 is a multilingual large language model for Southeast Asian (SEA) languages.
created_date: 2024-04-12
url: https://github.com/DAMO-NLP-SG/SeaLLMs
model_card: https://huggingface.co/SeaLLMs/SeaLLM-7B-v2.5
modality: text; text
analysis: The model was evaluated on three benchmarks (MMLU for English; M3Exam (M3e) for English, Chinese, Vietnamese, Indonesian, and Thai; and VMLU for Vietnamese), outperforming GPT-3 and Vistral-7B-chat across these benchmarks in the given languages.
size: 7B parameters
dependencies: [Gemma]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: Despite efforts in red teaming, safety fine-tuning, and enforcement, the creators advise that developers and stakeholders perform their own red teaming and put appropriate security measures in place before deployment, and that they comply with local governance and regulations.
access: open
license:
explanation: License can be found at https://huggingface.co/SeaLLMs/SeaLLM-13B-Chat/blob/main/LICENSE
value: custom
intended_uses: The model is intended for multilingual tasks such as knowledge retrieval, math reasoning, and instruction following. Also, it could be used to provide multilingual assistance.
prohibited_uses: The model should not be used in a way that could lead to inaccurate, misleading or potentially harmful generation. Users should comply with local laws and regulations when deploying the model.
monitoring: unknown
feedback: https://huggingface.co/SeaLLMs/SeaLLM-7B-v2.5/discussions
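
A minimal usage sketch with `transformers`, assuming the repository ships standard causal-LM weights and a chat template; the prompt and generation settings are illustrative only:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "SeaLLMs/SeaLLM-7B-v2.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Build a single-turn chat prompt via the model's chat template (assumed to be provided).
messages = [{"role": "user", "content": "Xin chào! Bạn có thể giúp gì cho tôi?"}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```
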
21 changes: 21 additions & 0 deletions assets/apple.yaml
@@ -22,3 +22,24 @@
prohibited_uses: ''
monitoring: ''
feedback: none
- type: model
name: OpenELM
organization: Apple
description: OpenELM is a family of Open-source Efficient Language Models. It uses a layer-wise scaling strategy to efficiently allocate parameters within each layer of the transformer model, leading to enhanced accuracy.
created_date: 2024-04-24
url: https://machinelearning.apple.com/research/openelm
model_card: https://huggingface.co/apple/OpenELM-3B-Instruct
modality: text; text
analysis: The models were evaluated on zero-shot tasks as well as the LLM360 and OpenLLM leaderboard benchmarks.
size: 3B parameters
dependencies: [RefinedWeb, The Pile, RedPajama-Data, Dolma, CoreNet library]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: unknown
access: open
license: Apple
intended_uses: To empower and enrich the open research community by providing access to state-of-the-art language models.
prohibited_uses: No explicit prohibited uses stated, though it is noted that users should undertake thorough safety testing.
monitoring: none
feedback: https://huggingface.co/apple/OpenELM-3B-Instruct/discussions
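
A minimal loading sketch with `transformers`; the use of `trust_remote_code=True` and of a Llama-2 tokenizer are assumptions based on the model card rather than guarantees:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# OpenELM ships custom modeling code on the Hub, hence trust_remote_code=True (assumption).
model = AutoModelForCausalLM.from_pretrained("apple/OpenELM-3B-Instruct", trust_remote_code=True)

# The release is assumed to reuse a LLaMA-style tokenizer rather than shipping its own;
# the Llama-2 tokenizer repository is gated and requires access approval.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

inputs = tokenizer("Once upon a time there was", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
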
21 changes: 21 additions & 0 deletions assets/cohere.yaml
@@ -543,3 +543,24 @@
prohibited_uses: ''
monitoring: unknown
feedback: https://huggingface.co/datasets/CohereForAI/aya_dataset/discussions
- type: model
name: Rerank 3
organization: Cohere
description: Rerank 3 is a new foundation model for efficient enterprise search and retrieval with 4k context length.
created_date: 2024-04-11
url: https://cohere.com/blog/rerank-3
model_card: none
modality: text; text
analysis: Evaluated on code retrieval and data retrieval capabilities, showing improvements over the existing standard in both.
size: unknown
dependencies: []
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: ''
access: limited
license: unknown
intended_uses: Efficient enterprise search and retrieval.
prohibited_uses: ''
monitoring: unknown
feedback: none
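
Since access is limited to the API, a minimal sketch with the Cohere Python SDK; the model identifier `rerank-english-v3.0` is an assumption about how Rerank 3 is exposed, and the API key is a placeholder:

```python
import cohere

co = cohere.Client("YOUR_API_KEY")  # placeholder key

docs = [
    "Reranking reorders candidate documents by relevance to a query.",
    "The quarterly report covers revenue and operating costs.",
    "Semantic search retrieves documents by meaning rather than keywords.",
]

# "rerank-english-v3.0" is assumed to correspond to Rerank 3.
response = co.rerank(model="rerank-english-v3.0", query="What is reranking?", documents=docs, top_n=2)

for result in response.results:
    print(result.index, result.relevance_score)
```
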
21 changes: 21 additions & 0 deletions assets/eleutherai.yaml
@@ -296,3 +296,24 @@
prohibited_uses: ''
monitoring: none
feedback: https://huggingface.co/datasets/EleutherAI/proof-pile-2/discussions
- type: model
name: Pile-T5
organization: EleutherAI
description: Pile-T5 is a version of the widely used T5 model, improved to eliminate weaknesses such as the omission of crucial code-related tokens. It uses the LLaMA tokenizer and is trained on the Pile, offering enhancements for finetuning on downstream tasks, particularly those involving code.
created_date: 2024-04-15
url: https://blog.eleuther.ai/pile-t5/
model_card: none
modality: text; text
analysis: The models were evaluated on SuperGLUE and CodeXGLUE, as well as MMLU and BigBench Hard. Comparisons with T5 v1.1 found that Pile-T5 models performed better in most conditions.
size: unknown
dependencies: [The Pile, T5x, LLaMA, umT5]
training_emissions: unknown
training_time: 2 million steps
training_hardware: unknown
quality_control: ''
access: open
license: unknown
intended_uses: The model is aimed at downstream tasks that benefit from the encoder-decoder architecture. Particularly useful for tasks involving code.
prohibited_uses: unknown
monitoring: unknown
feedback: unknown
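
A minimal loading sketch with `transformers`; the `EleutherAI/pile-t5-base` repository name is an assumption based on the blog post, and the model is assumed to follow the standard encoder-decoder (seq2seq) interface:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Repository name is an assumption; the release is described as including several sizes.
model_id = "EleutherAI/pile-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Note: this is the raw pretrained checkpoint, so outputs reflect span-corruption
# pretraining rather than instruction following; finetuning is the intended use.
inputs = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
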
22 changes: 22 additions & 0 deletions assets/fuse.yaml
@@ -0,0 +1,22 @@
---
- type: model
name: FuseChat
organization: FuseAI
description: FuseChat is a powerful chat large language model (LLM) that integrates multiple chat LLMs of varied structures and scales using a two-stage fuse-then-merge strategy.
created_date: 2024-02-26
url: https://arxiv.org/abs/2402.16107
model_card: https://huggingface.co/FuseAI/FuseChat-7B-VaRM
modality: text; text
analysis: The FuseChat model was evaluated on MT-Bench, which comprises 80 multi-turn dialogues spanning the writing, roleplay, reasoning, math, coding, STEM, and humanities domains. It yields an average performance of 66.52, with scores for individual domains available in the leaderboard results.
size: 7B parameters
dependencies: [Nous Hermes 2, OpenChat 3.5]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: none
access: open
license: Apache 2.0
intended_uses: FuseChat is intended to be used as a powerful chatbot that takes text inputs and provides text-based responses. It can be utilized in a variety of domains including writing, roleplay, reasoning, math, coding, STEM, and the humanities.
prohibited_uses: unknown
monitoring: unknown
feedback: https://huggingface.co/FuseAI/FuseChat-7B-VaRM/discussions
22 changes: 22 additions & 0 deletions assets/google.yaml
@@ -1782,3 +1782,25 @@
here https://ai.google.dev/gemma/prohibited_use_policy
monitoring: ''
feedback: https://huggingface.co/google/gemma-7b/discussions
- type: model
name: Med-Gemini
organization: Google
description: Med-Gemini is a family of highly capable multimodal models that are specialized in medicine with the ability to seamlessly integrate the use of web search, and that can be efficiently tailored to novel modalities using custom encoders.
created_date: 2024-04-29
url: https://arxiv.org/pdf/2404.18416
model_card: none
modality: image, text; text
analysis: Med-Gemini was evaluated on 14 medical benchmarks spanning text, multimodal, and long-context applications, establishing new state-of-the-art (SoTA) performance on 10 of them and surpassing the GPT-4 model family on every benchmark where a direct comparison is viable.
size: unknown
dependencies: [Gemini, MultiMedBench]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: ''
access: closed
license: unknown
intended_uses: To be used in areas of medical research including medical summarization, referral letter generation, and medical simplification tasks.
prohibited_uses: Unfit for real-world deployment in the safety-critical medical domain.
monitoring: ''
feedback: none

44 changes: 44 additions & 0 deletions assets/huggingface.yaml
@@ -171,3 +171,47 @@
prohibited_uses: unknown
monitoring: ''
feedback: https://huggingface.co/datasets/HuggingFaceTB/cosmopedia/discussions
- type: model
name: Idefics2
organization: Hugging Face
description: Idefics2 is a general multimodal model that takes as input arbitrary sequences of text and images, generating text responses. It has the capability to describe visual content, answer questions about images, perform basic arithmetic operations, create stories grounded in multiple images, and extract information from documents.
created_date: 2024-04-15
url: https://huggingface.co/blog/idefics2
model_card: https://huggingface.co/HuggingFaceM4/idefics2-8b
modality: image, text; text
analysis: The performance of Idefics2 has been evaluated on numerous benchmarks. It is at the top of its size class and competes with much larger models such as LLava-Next-34B and MM1-30B-chat.
size: 8B parameters
dependencies: [The Cauldron]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: The quality of the model has been ensured by training it on a mixture of openly available datasets and enhancing its OCR capabilities. Further improvements include manipulating images in their native resolutions and aspect ratios, better pre-trained backbones, and allowing for sub-image splitting.
access: open
license: Apache 2.0
intended_uses: The model can be used for answering questions about images, describing visual content, creating stories grounded in multiple images, extracting information from documents, and performing basic arithmetic operations.
prohibited_uses: unknown
monitoring: unknown
feedback: https://huggingface.co/HuggingFaceM4/idefics2-8b/discussions
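
A minimal multimodal inference sketch with `transformers` (a version with Idefics2 support, roughly 4.40 or later, is assumed); the image path is a placeholder:

```python
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

model_id = "HuggingFaceM4/idefics2-8b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForVision2Seq.from_pretrained(model_id, device_map="auto")

image = Image.open("example.jpg")  # placeholder image path
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]

# Interleave the image and text through the processor's chat template.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=64)
print(processor.batch_decode(outputs, skip_special_tokens=True)[0])
```
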
- type: dataset
name: The Cauldron
organization: Hugging Face
description: The Cauldron is an open compilation of 50 manually-curated datasets formatted for multi-turn conversations.
created_date: 2024-04-15
url: https://huggingface.co/blog/idefics2
datasheet: https://huggingface.co/datasets/HuggingFaceM4/the_cauldron
modality: image, text
size: 50 vision-language datasets
sample: []
analysis: none
dependencies:
explanation: These are the datasets with the most tokens included; the full list of all 50 datasets can be found at https://huggingface.co/datasets/HuggingFaceM4/the_cauldron
value: [LNarratives, Rendered Text, WebSight, DaTikz]
included: ''
excluded: ''
quality_control: unknown
access: open
license: CC BY 4.0
intended_uses: ''
prohibited_uses: ''
monitoring: unknown
feedback: https://huggingface.co/datasets/HuggingFaceM4/the_cauldron/discussions
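
Each of the 50 sub-datasets is exposed as its own configuration, so a minimal loading sketch with `datasets` might look like the following (the `ai2d` configuration name is an assumption; see the datasheet for the full list):

```python
from datasets import load_dataset

# Load one sub-dataset of The Cauldron; "ai2d" is assumed to be a valid configuration name.
ai2d = load_dataset("HuggingFaceM4/the_cauldron", "ai2d", split="train")

example = ai2d[0]
print(example.keys())  # expected to include image(s) and a multi-turn text field (assumption)
```
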
22 changes: 22 additions & 0 deletions assets/konan.yaml
@@ -0,0 +1,22 @@
---
- type: model
name: Konan LLM
organization: Konan
description: Konan LLM is a Large Language Model developed in-house by Konan Technology. Optimized for super-large AI training, it leverages high-quality, large-scale data and over 20 years of expertise in natural language processing.
created_date: 2023-09-17
url: https://en.konantech.com/en/llm/konanllm
model_card: none
modality: text; text
analysis: none
size: 13B parameters
dependencies: []
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: ''
access: limited
license: unknown
intended_uses: Document generation, document review, Q&A, customer response scenarios.
prohibited_uses: ''
monitoring: ''
feedback: none
22 changes: 22 additions & 0 deletions assets/ktai.yaml
@@ -0,0 +1,22 @@
---
- type: model
name: Midm
organization: KT Corporation
description: Midm is a pre-trained Korean-English language model developed by KT. It takes text as input and generates text. The model is based on the Transformer architecture as an auto-regressive language model.
created_date: 2023-10-31
url: https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1
model_card: https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1
modality: text; text
analysis: unknown
size: 7B parameters
dependencies: [AI-HUB dataset, National Institute of Korean Language dataset]
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: KT tried to remove unethical expressions such as profanity, slang, prejudice, and discrimination from training data.
access: open
license: CC-BY-NC 4.0
intended_uses: It is expected to be used for various research purposes.
prohibited_uses: It cannot be used for commercial purposes.
monitoring: unknown
feedback: https://huggingface.co/KT-AI/midm-bitext-S-7B-inst-v1/discussions
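
A minimal loading sketch with `transformers`; `trust_remote_code=True` is assumed to be required because the repository appears to ship custom modeling code, and the prompt below is illustrative rather than the card's prescribed instruction format:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "KT-AI/midm-bitext-S-7B-inst-v1"
# trust_remote_code=True is an assumption based on the custom code in the repository.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, device_map="auto")

prompt = "안녕하세요. 자기소개를 해주세요."  # illustrative prompt; see the model card for the instruction format
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
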
22 changes: 22 additions & 0 deletions assets/lg.yaml
@@ -0,0 +1,22 @@
---
- type: model
name: EXAONE 2.0
organization: LG AI Research
description: EXAONE 2.0 is a multimodal artificial intelligence that can be used to help develop new materials and medicines.
created_date: 2023-07-19
url: https://www.lgresearch.ai/exaone
model_card: none
modality: image, text; image, text
analysis: none
size: unknown
dependencies: []
training_emissions: unknown
training_time: unknown
training_hardware: unknown
quality_control: ''
access: closed
license: unknown
intended_uses: ''
prohibited_uses: ''
monitoring: ''
feedback: none