zsl = ZeroShotClassifier()
labels = ['politics', 'elections', 'sports', 'films', 'television']
doc = 'I am extremely dissatisfied with the President and will definitely vote in 2020.'
preds = zsl.predict(doc, labels=labels, include_labels=True)
Analyzers
++ +
list2chunks
++++list2chunks (a, n)
+ +
ZeroShotClassifier
++++ZeroShotClassifier (model_name='facebook/bart-large-mnli', device=None)
Interface to Zero Shot Topic Classifier
++ +
ZeroShotClassifier.predict
++++ZeroShotClassifier.predict (docs, labels=[], include_labels=False, multilabel=True, max_length=512, batch_size=8, nli_template='This text is about {}.', topic_strings=[])
This method performs zero-shot text classification using Natural Language Inference (NLI).
Parameters:
- docs (list|str): text of a document or a list of texts
- labels (list): a list of strings representing topics of your choice. Example: labels=['political science', 'sports', 'science']
- include_labels (bool): If True, will return topic labels along with topic probabilities
- multilabel (bool): If True, labels are considered independent, and multiple labels can be predicted true for a document and be close to 1. If False, scores are normalized such that probabilities sum to 1.
- max_length (int): truncate long documents to this many tokens
- batch_size (int): batch size to use (default: 8). Increase this value to speed up predictions, especially if len(topic_strings) is large.
- nli_template (str): labels are inserted into this template for use as hypotheses in natural language inference
- topic_strings (list): alias for the labels parameter, kept for backwards compatibility
Returns:
inferred probabilities, or a list of inferred probabilities if docs is a list
+ preds
[('politics', 0.9791897535324097),
+ ('elections', 0.9874581098556519),
+ ('sports', 0.0005765464738942683),
+ ('films', 0.002292431192472577),
+ ('television', 0.0010546175763010979)]
d = dict(preds)
assert d['politics'] > 0.9
assert d['elections'] > 0.9
assert d['sports'] < 0.1
assert d['films'] < 0.1
assert d['television'] < 0.1
+ +
TextEncoder
++++TextEncoder (model_name='stsb-roberta-large', device=None)
Tiny wrapper to sentence-transformers
te = TextEncoder()
/home/amaiya/mambaforge/envs/pt/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
+ warnings.warn(
e = te.encode('The moon is bright.')
assert e.shape[0] == 1
assert e.shape[1] == 1024
+ +
TopicModel
++++TopicModel (texts=None, n_topics=None, n_features=10000, min_df=5, max_df=0.5, stop_words='english', model_type='lda', lda_max_iter=5, lda_mode='online', token_pattern=None, verbose=1, hyperparam_kwargs=None)
Fits a topic model to the documents in `texts`.
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

# compile the texts
texts = newsgroups_train.data + newsgroups_test.data

# let's also store the newsgroup category associated with each document
# we can display this information in visualizations
targets = [target for target in list(newsgroups_train.target) + list(newsgroups_test.target)]
categories = [newsgroups_train.target_names[target] for target in targets]
tm = TopicModel(texts, n_features=10000)
n_topics automatically set to 97
+preprocessing texts...
+fitting model...
+iteration: 1 of max_iter: 5
+iteration: 2 of max_iter: 5
+iteration: 3 of max_iter: 5
+iteration: 4 of max_iter: 5
+iteration: 5 of max_iter: 5
+done.
+ tm.print_topics()
topic 0 | tape adam tim case moved bag quote mass marked zionism
+topic 1 | image jpeg images format programs tiff files jfif save lossless
+topic 2 | alternative movie film static cycles films philips dynamic hou phi
+topic 3 | hell humans poster frank reality kent gerard gant eternal bell
+topic 4 | air phd chz kit cbc ups w-s rus w47 mot
+topic 5 | dog math great figure poster couldn don trying rushdie fatwa
+topic 6 | collaboration nazi fact end expression germany philly world certified moore
+topic 7 | gif points scale postscript mirror plane rendering algorithm polygon rayshade
+topic 8 | fonts font shell converted iii characters slight composite breaks compress
+topic 9 | power station supply options option led light tank plastic wall
+topic 10 | transmission rider bmw driver automatic shift gear japanese stick highway
+topic 11 | tyre ezekiel ruler hernia appeared appointed supreme man land power
+topic 12 | space nasa earth data launch surface solar moon mission planet
+topic 13 | israel jews jewish israeli arab peace war arabs palestinian kuwait
+topic 14 | olvwm xremote animals kinds roughing toolkit close corp glenn imakefile
+topic 15 | medical health disease cancer patients drug treatment drugs aids study
+topic 16 | biden chip gear like information number automatic mode insurance know
+topic 17 | graphics zip amiga shareware formats ftp gif program sgi convert
+topic 18 | brilliant mail did god coming christianity people got ideas reading
+topic 19 | black red white blue green cross wires lines helmet mask
+topic 20 | car engine cars miles clutch new ford rear slip road
+topic 21 | list mailing service model small large lists radar available major
+topic 22 | key encryption chip keys clipper phone security use government privacy
+topic 23 | talking pit nyr stl phi edm mtl wsh hfd cgy
+topic 24 | signal input switch connected circuit audio noise output control voltage
+topic 25 | stuff deleted die posting beware fantastic motives authentic reluctant hope
+topic 26 | adams douglas dc-x garrett ingres tin sdio incremental mcdonnell guide
+topic 27 | men homosexual homosexuality women gay sexual homosexuals male kinsey pop
+topic 28 | usual leo rs-232 martian reading cooperative unmanned somalia decompress visited
+topic 29 | edu university information send new computer research mail internet address
+topic 30 | reserve naval marine ret commission one-way irgun prior closure facilities
+topic 31 | state intelligence militia units army zone georgia sam croats belongs
+topic 32 | says article pain known warning doctor stone bug kidney response
+topic 33 | faq rsa ripem lights yes patent nist management wax cipher
+topic 34 | wolverine comics hulk appearance special liefeld sabretooth incredible hobgoblin x-force
+topic 35 | software ram worth cycles controller available make dram dynamic situation
+topic 36 | religion people religious catalog bobby used driven involved long like
+topic 37 | intel sites experiment ftp does know family good like mrs
+topic 38 | armenian people army russian turkish genocide armenians ottoman turks jews
+topic 39 | theft geo available face couldn cover sony people number shop
+topic 40 | christianity did exists mail matter mind tool status god reading
+topic 41 | propane probe earth orbit orbiter titan cassini space atmosphere gravity
+topic 42 | people government right think rights law make public fbi don
+topic 43 | god people does say believe bible true think evidence religion
+topic 44 | mov phone south key war supply push left just registered
+topic 45 | period goal pts play chicago pittsburgh buffalo shots new blues
+topic 46 | game team games year hockey season players player baseball league
+topic 47 | speed dod student technician just hits right note giant light
+topic 48 | sex marriage relationship family married couple depression pregnancy childhood trademark
+topic 49 | protects rejecting com4 couple decides taking connect unc nearest richer
+topic 50 | president states united american national press april washington america white
+topic 51 | card memory windows board ram bus drivers driver cpu problem
+topic 52 | window application manager display button xterm path widget event resources
+topic 53 | cable win van det bos tor cal nyi chi buf
+topic 54 | americans baltimore rochester cape springfield moncton providence utica binghamton adirondack
+topic 55 | color monitor screen mouse video colors resolution vga colour monitors
+topic 56 | option power ssf flights capability module redesign missions station options
+topic 57 | body father son vitamin diet day cells cell form literature
+topic 58 | max g9v b8f a86 bhj giz bxn biz qax b4q
+topic 59 | bit fast chip ibm faster mode chips scsi-2 speeds quadra
+topic 60 | book books law adl islam islamic iran media bullock muslims
+topic 61 | armenian russian turkish ottoman people army armenians genocide war turks
+topic 62 | oscillator partition tune nun umumiye nezareti mecmuasi muharrerat-i evrak version
+topic 63 | tongues seat est didn raise copied lazy schemes adapter leap
+topic 64 | com object jim app function motorola heterosexual objects pointers encountered
+topic 65 | effective boy projects grow jason ain dump keyboards vastly grants
+topic 66 | armenian people russian armenians turks ottoman army turkish genocide muslim
+topic 67 | mac apple pin ground wire quicktime macs pins connector simms
+topic 68 | bastard turning likes hooks notions turks cited proud pointers chuck
+topic 69 | bought dealer cost channel replaced face sony stereo warranty tube
+topic 70 | myers food reaction msg writes loop eat dee effects taste
+topic 71 | lander contradiction reconcile apparent somebody supplement essential needs produce insulin
+topic 72 | re-boost systems virginia voice unix input ken easily summary developing
+topic 73 | block tests suck shadow dte screws macedonia sunlight fin message
+topic 74 | jesus church christ god lord holy spirit mary shall heaven
+topic 75 | gun number year guns rate insurance police years new firearms
+topic 76 | rule automatically characteristic wider thumb recommendation inline mr2 halfway width
+topic 77 | drive disk hard scsi drives controller floppy ide master transfer
+topic 78 | stephanopoulos water gas oil heat energy hot temperature cold nuclear
+topic 79 | like know does use don just good thanks need want
+topic 80 | starters mlb mov higher signing left accessible argument viola teams
+topic 81 | entry rules info define entries year int printf include contest
+topic 82 | price new sale offer sell condition shipping interested asking prices
+topic 83 | issue germany title magazine german cover race generation origin nazi
+topic 84 | armenian armenians people turkish war said killed children russian turkey
+topic 85 | dos windows software comp library os/2 version microsoft applications code
+topic 86 | probe space launch titan earth cassini orbiter orbit atmosphere mission
+topic 87 | housed throws fills daylight occurring activities adjacent presenting punish occuring
+topic 88 | statement folk raids thor disarmed anatolia polygon inria arrive smehlik
+topic 89 | sound steve pro convert ati ultra fahrenheit orchid hercules blaster
+topic 90 | joke tricky wearing golden trickle seen geneva csh course caesar
+topic 91 | moral objective values morality child defined bank definition wrong different
+topic 92 | files file edu ftp available version server data use sun
+topic 93 | catalog tons seal ordering kawasaki tools fax free ultraviolet packages
+topic 94 | file program error output use section line code command problem
+topic 95 | power ssf module capability option flights redesign missions human station
+topic 96 | just don think know like time did going didn people
+ tm.build(texts)
done.
texts[1]
"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks."
tm.doc_topics[1]
array([0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.05935853, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.04939132, 0.00105197,
+ 0.00105197, 0.00105197, 0.04181867, 0.00105197, 0.00105197,
+ 0.00105197, 0.21681858, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.02146013, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.0458702 , 0.02146013, 0.14892628,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.00105197, 0.00105197, 0.00105197,
+ 0.00105197, 0.00105197, 0.13724779, 0.00105197, 0.00105197,
+ 0.00105197, 0.16612722])
tm.topics[np.argmax(tm.doc_topics[1])]
'card memory windows board ram bus drivers driver cpu problem'
tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees ' +
            'the development and manufacturing of advanced rockets and spacecraft for missions ' +
            'to and beyond Earth orbit.'])
array([[0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.65009096, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.06185567, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214, 0.00303214, 0.00303214, 0.00303214,
+ 0.00303214, 0.00303214]])
tm.topics[np.argmax(tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees ' +
                                'the development and manufacturing of advanced rockets and spacecraft for missions ' +
                                'to and beyond Earth orbit.']))]
'space nasa earth data launch surface solar moon mission planet'
+