Change openai engine to use raw tweet JSON data #3

Merged
merged 14 commits on Feb 26, 2024
1 change: 1 addition & 0 deletions bin/debug-answer-engine.ts
@@ -90,6 +90,7 @@ async function main() {
}
)

console.log(`logging ${batch.messages.length} message threads to stderr...`)
console.log()
console.warn(JSON.stringify(answerEngineQueries, null, 2))
}
78 changes: 75 additions & 3 deletions bin/debug-scrape-url.ts
@@ -1,3 +1,5 @@
import pMap from 'p-map'

import '../src/config.js'
import { ScraperClient } from '../src/services/scraper-client.js'
import { omit } from '../src/utils.js'
@@ -8,10 +10,80 @@ import { omit } from '../src/utils.js'
async function main() {
const scraperClient = new ScraperClient()

const res = await scraperClient.scrapeUrl(
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html'
const urls = [
'https://www.nytimes.com/2023/05/31/magazine/ai-start-up-accelerator-san-francisco.html',
'https://www.youtube.com/watch?v=NNgdcn4Ux1k&ab_channel=LexClips',
'https://digg.com/memes-ranked/link/best-memes-ranked-pussy-in-bio-mandela-effect-room-space?utm_source=digg',
'https://platform.openai.com/docs/guides/vision',
'https://en.wikipedia.org/wiki/Larry_Page',
'https://www.flowrestling.org/articles/12162675-oklahoma-state-wrestling-on-the-hunt-for-upsets-against-iowa',
'https://github.com/transitive-bullshit/lqip-modern',
'https://www.gatesnotes.com/AI-agents',
'https://blog.eladgil.com/p/early-days-of-ai',
'https://bair.berkeley.edu/blog/2024/02/18/compound-ai-systems/',
'https://www.bbc.com/news/business-68387018',
'https://www.bbc.com/sport/football/68395310',
'https://www.kayak.com/',
'https://marmelab.com/blog/2024/01/23/react-19-new-hooks.html?ref=labnotes.org',
'https://www.foxnews.com/us/ai-technology-could-help-us-allies-monitor-chinas-taiwan-invasion-intensions',
'https://twitter.com/paulg/status/1761731253764579573',
'https://twitter.com/transitive_bs',
'https://transitivebullsh.it/chatgpt-twitter-bot-lessons',
'https://www.swyx.io/learn-in-public',
'https://leerob.io/blog/developer-experience-examples',
'https://rauchg.com/2021/making-the-web-faster',
'https://blog.google/products/gemini/bard-gemini-advanced-app/',
'https://apnews.com/article/2024-qatar-swimming-worlds-underwater-camera-splash',
'https://www.amazon.com/Deepness-Sky-Zones-Thought-Book-ebook/dp/B002H8ORKM/?_encoding=UTF8&pd_rd_w=4N09q&content-id=amzn1.sym.379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_p=379956f8-690b-4143-ad17-ba606cbec0c1&pf_rd_r=NXZSG4MAQ5P40FP5T5ZR&pd_rd_wg=t7KmU&pd_rd_r=5c051a29-61a2-468a-bc68-ad2754e52d05&ref_=pd_gw_bmx27b',
'https://www.reddit.com/r/MadeMeSmile/comments/u33nuc/he_finally_got_his_acorn/',
'https://www.reddit.com/r/Damnthatsinteresting/comments/ujl32z/this_is_jeanbaptiste_kempf_the_creator_of_vlc/',
'https://news.ycombinator.com/item?id=35154527',
'https://news.ycombinator.com/item?id=11116274',
'https://www.bbc.com/news/uk-43396008',
'https://www.apple.com/customer-letter/',
'https://openai.com/blog/openai-announces-leadership-transition',
'https://www.apple.com/stevejobs/', // output includes some weird #{ref} stuff
'https://groups.google.com/g/vim_announce/c/tWahca9zkt4?pli=1',
'https://bensbites.beehiiv.com/',
'https://bensbites.beehiiv.com/p/open-ai-serious-building-new-app-store',
'https://anilist.co/anime/1/Cowboy-Bebop/',
'https://dexa.ai/',
'https://dexa.ai/s/S7RDMg3f',
'https://www.quora.com/What-can-I-learn-know-right-now-in-10-minutes-that-will-be-useful-for-the-rest-of-my-life',
'https://www.quora.com/How-do-top-students-study',
'https://www.quora.com/What-are-the-most-surreal-places-to-visit',
'https://www.instagram.com/p/BTKd8z2jM14/?img_index=1',
'https://www.linkedin.com/in/fisch2/',
'https://www.facebook.com/zuck/',
'https://github.com/sindresorhus',
'https://www.pornhub.com/',
'https://www.tiktok.com/@zachking/video/6768504823336815877?embed_source=71929438%2C121374463%2C121351166%2C121331973%2C120811592%2C120810756%3Bnull%3Bembed_blank&refer=embed&referer_url=metricool.com%2Ftiktoks-most-viral-videos%2F&referer_video_id=6768504823336815877',
'https://www.tiktok.com/@zachking/video/6749520869598481669'
]

const results = (
await pMap(
urls,
async (url) => {
try {
return await scraperClient.scrapeUrl(url)
} catch (err: any) {
console.error('error processing url', url, err.toString())
}
},
{
concurrency: 4
}
)
).filter(Boolean)

console.log(
JSON.stringify(
results.map((res) => omit(res, 'content', 'rawHtml')),
null,
2
)
)
console.log(JSON.stringify(omit(res, 'content', 'rawHtml'), null, 2))
}

main()
5 changes: 1 addition & 4 deletions readme.md
@@ -82,16 +82,13 @@ Flags:
## TODO

- understand why mentions from non-verified accounts aren't being reported by the twitter api
- support quote tweet and retweet context
- support user entity context
- add test fixtures testing these different use cases
- fix support for empty mentions
- currently works but duplicates the previous tweet's contents
- support `url` entities
- expand them with metadata
- support `media` entities
- populate media entities
- openai use gpt-4-vision-preview
- for openai, use gpt-4-vision-preview
- conditionally preprocess images using `sharp` to ensure they are supported by gpt4v
- improve openai answer engine
- dalle tool
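One of the TODO items above mentions conditionally preprocessing images with `sharp` so they are accepted by gpt-4-vision-preview. A rough sketch of how that could look; the size cap and output format below are assumptions for illustration, not values from this repo:

```ts
import sharp from 'sharp'

// Hypothetical sketch: re-encode an arbitrary image buffer into a size and
// format that gpt-4-vision-preview accepts. The 2048px cap and PNG output
// are illustrative assumptions, not settings from this repository.
export async function normalizeImageForGpt4V(input: Buffer): Promise<Buffer> {
  return sharp(input)
    .resize({
      width: 2048,
      height: 2048,
      fit: 'inside', // preserve aspect ratio, never crop
      withoutEnlargement: true // don't upscale small images
    })
    .png()
    .toBuffer()
}
```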
2 changes: 2 additions & 0 deletions src/__snapshots__/entities.test.ts.snap
@@ -19,6 +19,7 @@ exports[`mergeEntityMaps 1`] = `
"type": "tweet",
},
},
"urls": {},
"users": {
"1235525929335689217": {
"name": "Lofi Grind",
@@ -67,6 +68,7 @@ exports[`mergeEntityMaps 2`] = `
"1760384146004996333": {},
"test": {},
},
"urls": {},
"users": {
"1235525929335689217": {},
"327034465": {},
9 changes: 6 additions & 3 deletions src/answer-engine.test.ts
@@ -10,8 +10,8 @@ import { getTwitterClient } from './services/twitter-client.js'
import { rUrl } from './utils.js'

const fixtures = fixturesData as unknown as types.AnswerEngineQuery[]
// const answerEngines = [new OpenAIAnswerEngine(), new DexaAnswerEngine()]
const answerEngines = [new OpenAIAnswerEngine()]
const answerEngines = [new OpenAIAnswerEngine(), new DexaAnswerEngine()]
// const answerEngines = [new OpenAIAnswerEngine()]

for (const answerEngine of answerEngines) {
describe(`${answerEngine.type} answer engine`, async () => {
@@ -40,7 +40,10 @@ for (const answerEngine of answerEngines) {
ctx
)

console.log(`${answerEngine.type} tweet ${tweetUrl} ⇒`, response)
console.log(
`\n**QUESTION** ${tweetUrl}\n\n**ANSWER**\n\n${response}\n\n`
)

assert(response.length > 0, 'response should not be empty')
assert(response.trim() === response, 'response should be trimmed')

117 changes: 72 additions & 45 deletions src/answer-engine.ts
@@ -1,17 +1,22 @@
import { Msg } from '@dexaai/dexter'
import { Msg, stringifyForModel } from '@dexaai/dexter'
import pMap from 'p-map'

import * as config from '../src/config.js'
import * as db from './db.js'
import type * as types from './types.js'
import { BotError } from './bot-error.js'
import {
type EntitiesMap,
convertTweetToEntitiesMap,
type EntityMap,
convertTweetToEntityMap,
mergeEntityMaps
} from './entities.js'
import { sanitizeTweetText, stripUserMentions } from './twitter-utils.js'
import { assert } from './utils.js'
import {
getPrunedTweet,
getPrunedTwitterUser,
sanitizeTweetText,
stripUserMentions
} from './twitter-utils.js'
import { assert, pick } from './utils.js'

export abstract class AnswerEngine {
readonly type: types.AnswerEngineType
@@ -25,7 +30,10 @@ export abstract class AnswerEngine {
ctx: types.AnswerEngineContext
) {
const query = await this.resolveMessageThread(message, ctx)
console.log(`>>> ${this.type} answer engine`, query)
console.log(
`\n>>> ${this.type} answer engine`,
pick(query, 'message', 'chatMessages', 'tweets', 'entityMap')
)

message.response = await this.generateResponseForQuery(query, ctx)

@@ -46,6 +54,9 @@ export abstract class AnswerEngine {
}

try {
// replace markdown lists with unicode bullet points
response = response.replaceAll(/^\s*-\s+/gm, '• ')

response = sanitizeTweetText(response, {
label: `generated by answer engine "${this.type}"`
})
@@ -161,10 +172,7 @@ export abstract class AnswerEngine {
...Msg.user(tweet.text, {
name: userIdToUsernameMap[tweet.author_id!]
}),

entities: {
tweetIds: [tweet.id]
}
tweetId: tweet.id
})
)

@@ -175,23 +183,15 @@ export abstract class AnswerEngine {
...Msg.user(message.prompt, {
name: userIdToUsernameMap[message.promptUserId]
}),

entities: {
tweetIds: [message.promptTweetId]
}
tweetId: message.promptTweetId
},

message.response && message !== leafMessage
? {
...Msg.assistant(message.response!, {
name: userIdToUsernameMap[ctx.twitterBotUserId]
}),

entities: {
tweetIds: message.responseTweetId
? [message.responseTweetId!]
: []
}
tweetId: message.responseTweetId!
}
: null
].filter(Boolean)
@@ -210,56 +210,83 @@ export abstract class AnswerEngine {
.reverse()
}

const chatMessages = answerEngineMessages.map(
({ tweetId, ...message }) => message
)

// Resolve all entity maps for the tweets and messages in the thread and then
// condense them into a single, normalized entity map
let entityMap: EntitiesMap = {}

for (const answerEngineMessage of answerEngineMessages) {
if (!answerEngineMessage.entities?.tweetIds) continue

for (const tweetId of answerEngineMessage.entities.tweetIds) {
if (entityMap.tweets?.[tweetId]) continue

const tweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: false
})
if (!tweet) continue

const tweetEntityMap = await convertTweetToEntitiesMap(tweet, ctx, {
fetchMissingEntities: true
})

entityMap = mergeEntityMaps(entityMap, tweetEntityMap)
}
}
let entityMap: EntityMap = {}

// Construct a raw array of tweets to pass to the answer engine, which may
// be easier to work with than our structured AnswerEngineMessage format
// be easier to work with than our AnswerEngineMessage format
const tweets = (
await pMap(
answerEngineMessages,
async (message) => {
const tweetId = message.entities?.tweetIds?.[0]
const { tweetId } = message
assert(tweetId)

const tweet = await db.tryGetTweetById(tweetId, ctx, {
fetchFromTwitter: true
})
if (!tweet) return

return tweet
const tweetEntityMap = await convertTweetToEntityMap(tweet, ctx, {
fetchMissingEntities: true
})

entityMap = mergeEntityMaps(entityMap, tweetEntityMap)

return getPrunedTweet(tweet)
},
{
concurrency: 8
}
)
).filter(Boolean)

const rawChatMessages = tweets.map((tweet) =>
tweet.author_id === ctx.twitterBotUserId
? Msg.assistant(stringifyForModel(tweet), {
name: userIdToUsernameMap[tweet.author_id!]
})
: Msg.user(stringifyForModel(tweet), {
name: userIdToUsernameMap[tweet.author_id!]
})
)

const rawEntityMap: types.RawEntityMap = {
users: {},
tweets: {},
urls: entityMap.urls ?? {}
}

if (entityMap?.users) {
for (const user of Object.values(entityMap.users)) {
assert(user.twitterId)
const twitterUser = await db.tryGetUserById(user.twitterId)
if (!twitterUser) continue
rawEntityMap.users[user.twitterId] = getPrunedTwitterUser(twitterUser)
}
}

if (entityMap?.tweets) {
for (const tweet of Object.values(entityMap.tweets)) {
assert(tweet.id)
const twittertweet = await db.tryGetTweetById(tweet.id, ctx)
if (!twittertweet) continue
rawEntityMap.tweets[tweet.id] = getPrunedTweet(twittertweet)
}
}

return {
message,
answerEngineMessages,
chatMessages,
rawChatMessages,
tweets,
entityMap
entityMap,
rawEntityMap
}
}
}
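Note: this diff calls `getPrunedTweet` and `getPrunedTwitterUser` from `./twitter-utils.js`, but their implementations aren't shown here. A minimal sketch of what such pruning helpers might look like, assuming they simply keep a whitelist of fields before the objects are serialized for the model (the field lists below are illustrative guesses, not the real ones):

```ts
// Hypothetical sketch only; the actual helpers live in src/twitter-utils.ts
// and are not part of this diff.
type AnyRecord = Record<string, unknown>

// Copy only the whitelisted keys that are actually present on the object.
function pickFields(obj: AnyRecord, keys: string[]): AnyRecord {
  return Object.fromEntries(
    keys.filter((key) => key in obj).map((key) => [key, obj[key]])
  )
}

// Assumed subset of tweet fields worth sending to the model.
export function getPrunedTweet(tweet: AnyRecord): AnyRecord {
  return pickFields(tweet, [
    'id',
    'text',
    'author_id',
    'created_at',
    'referenced_tweets'
  ])
}

// Assumed subset of user fields worth sending to the model.
export function getPrunedTwitterUser(user: AnyRecord): AnyRecord {
  return pickFields(user, ['id', 'name', 'username', 'description'])
}
```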
2 changes: 1 addition & 1 deletion src/answer-engines/dexa-answer-engine.ts
@@ -16,7 +16,7 @@ export class DexaAnswerEngine extends AnswerEngine {
ctx: types.AnswerEngineContext
): Promise<string> {
return this._dexaClient.generateResponse({
messages: query.answerEngineMessages,
messages: query.chatMessages,
entityMap: query.entityMap
})
}