From 702e12fcedb7dd38e26e6ac33a84b8142dddc9cb Mon Sep 17 00:00:00 2001 From: Oussama Saoudi Date: Wed, 11 Dec 2024 17:13:30 -0800 Subject: [PATCH] Add fix for sv extension (#591) ## What changes are proposed in this pull request? This is a PR fixes the behaviour for CDF on tables with deletion vectors. When computing selection vectors from add/remove dv pairs, the selection vector must be extended with false. When computing selection vectors from a normal dv, we extend the selection vector with true. ## How was this change tested? I create a table which adds and removes deletion vectors whose lengths are shorter than the total number of rows. The test checks: - deleting rows when none were deleted. - deleting rows when some were deleted - restoring some of the rows that have been deleted - restoring some rows and removing another - restoring all rows. All of these rows are less than the size of the table, so they exercise the extension behaviour for selection vectors. --- kernel/src/table_changes/scan.rs | 28 +++++++++++++++++++- kernel/tests/cdf.rs | 20 +++++++++++++- kernel/tests/data/cdf-table-with-dv.tar.zst | Bin 1915 -> 2310 bytes 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/kernel/src/table_changes/scan.rs b/kernel/src/table_changes/scan.rs index 2d26de785..0902bd2d0 100644 --- a/kernel/src/table_changes/scan.rs +++ b/kernel/src/table_changes/scan.rs @@ -290,6 +290,8 @@ fn read_scan_file( physical_to_logical_expr, global_state.logical_schema.clone().into(), ); + // Determine if the scan file was derived from a deletion vector pair + let is_dv_resolved_pair = scan_file.remove_dv.is_some(); let table_root = Url::parse(&global_state.table_root)?; let location = table_root.join(&scan_file.path)?; @@ -314,7 +316,31 @@ fn read_scan_file( // trying to return a captured variable. We're going to reassign `selection_vector` // to `rest` in a moment anyway let mut sv = selection_vector.take(); - let rest = split_vector(sv.as_mut(), len, None); + + // Gets the selection vector for a data batch with length `len`. There are three cases to + // consider: + // 1. A scan file derived from a deletion vector pair getting resolved. + // 2. A scan file that was not the result of a resolved pair, and has a deletion vector. + // 3. A scan file that was not the result of a resolved pair, and has no deletion vector. + // + // # Case 1 + // If the scan file is derived from a deletion vector pair, its selection vector should be + // extended with `false`. Consider a resolved selection vector `[0, 1]`. Only row 1 has + // changed. If there were more rows (for example 4 total), then none of them have changed. + // Hence, the selection vector is extended to become `[0, 1, 0, 0]`. + // + // # Case 2 + // If the scan file has a deletion vector but is unpaired, its selection vector should be + // extended with `true`. Consider a deletion vector with row 1 deleted. This generates a + // selection vector `[1, 0, 1]`. Only row 1 is deleted. Rows 0 and 2 are selected. If there + // are more rows (for example 4), then all the extra rows should be selected. The selection + // vector becomes `[1, 0, 1, 1]`. + // + // # Case 3 + // These scan files are either simple adds, removes, or cdc files. This case is a noop because + // the selection vector is `None`. + let extend = Some(!is_dv_resolved_pair); + let rest = split_vector(sv.as_mut(), len, extend); let result = ScanResult { raw_data: logical, raw_mask: sv, diff --git a/kernel/tests/cdf.rs b/kernel/tests/cdf.rs index c93b22d24..5263df960 100644 --- a/kernel/tests/cdf.rs +++ b/kernel/tests/cdf.rs @@ -54,6 +54,14 @@ fn read_cdf_for_table( #[test] fn cdf_with_deletion_vector() -> Result<(), Box> { let batches = read_cdf_for_table("cdf-table-with-dv", 0, None)?; + // Each commit performs the following: + // 0. Insert 0..=9 + // 1. Remove [0, 9] + // 2. Restore [0, 9] + // 3. Remove [0, 1, 4, 5] + // 4. Restore [1, 4] + // 5. Restore [0, 5] and Remove [3] + // 6. Restore 3 let mut expected = vec![ "+-------+--------------+-----------------+", "| value | _change_type | _commit_version |", @@ -65,13 +73,23 @@ fn cdf_with_deletion_vector() -> Result<(), Box> { "| 4 | insert | 0 |", "| 5 | insert | 0 |", "| 6 | insert | 0 |", - "| 8 | insert | 0 |", "| 7 | insert | 0 |", + "| 8 | insert | 0 |", "| 9 | insert | 0 |", "| 0 | delete | 1 |", "| 9 | delete | 1 |", "| 0 | insert | 2 |", "| 9 | insert | 2 |", + "| 0 | delete | 3 |", + "| 1 | delete | 3 |", + "| 4 | delete | 3 |", + "| 5 | delete | 3 |", + "| 1 | insert | 4 |", + "| 4 | insert | 4 |", + "| 3 | delete | 5 |", + "| 0 | insert | 5 |", + "| 5 | insert | 5 |", + "| 3 | insert | 6 |", "+-------+--------------+-----------------+", ]; sort_lines!(expected); diff --git a/kernel/tests/data/cdf-table-with-dv.tar.zst b/kernel/tests/data/cdf-table-with-dv.tar.zst index aaef3a3eb717943943294828f069c1f5e1bf5339..76c9f09daec69f80001f9e92848847665bcd6a31 100644 GIT binary patch literal 2310 zcmV+h3HkOYwJ-euSnWpuif=R)M-Y*l0{{R3002e+Ag)UzKp0Q}B7Z9A`o|y=mW`J2 zD4NM`vTmC}%K$vMd9NWJUiI$YQc5XBaPK`;QZKN*ZS~1D~2;#jdWDh7fL=E>JC|7|pd_HDg)M?!{^a4tgE@&!ub)~q;YlE94OOetacPGoIpU*ezN;(D{{!)k?L^ zY4LLG+vJ~Ve_t6+YmOYnd+1qnZj*yMoT?DpB)gA$r)ka3m18I8HhC(hPZg=N`|ft< z;B`VAug?T2q4V^@*X`(pqDPMWrPKNEXl;Zdis3eje;}PF`NvUVs6>hyLN@1(91Jb! z@K9{wpx3X9yKpi;@%aA%|1aSGJN`ew{{#3CWI8hc|Cx{f*EqNW zlVs7lO%mCSBF9(2GMpsU+$Py~CpYJvyiNqWZckN#)oN9@8PmOQhh8^sZd&+mljJ={ zBouvL8E&(4@^liXHLpe+G(dgGxcp_<@WG>18%H`qLy3%-+k7G6q4$NNF~yoAj@qNx z$*J0Aht<>>=ZiCK=Qc}PL@cFI35J!>s8rG-IuVUjQa*%AD58^TO-iZ0H^b&OOHM?i z5=x3_v{J%?BATjglBnIMm9O3=N8ypvTB){4^77jqLecA9A9w16f8K?TcOmaE92IVt zk==JUEp0K@YONWD>AKAnxI4)~Z#1nrx9QS%YO1!$Uqq0PQ-$4otHr9-uC=w&n{&P* zN#fG9bY1A6COm;p@R@(QeWufJ+;H&!nTlzfGo@ITDWz0vYH!7C=$S4~Gq*X*ZZ>5g zQ?46?nKTArP1B%^ak_R@mqxL3{G}8I_uiXvhA&pJtLcU546_;ITX8*0HGwb|Z_ zGls3cG^$~GbJkf~Y;i7WwpD9d&8)>1Q>>|kVy%}_dNmK3LMhMsYarJWG83tH*NN$ON5?4a9 z0TqK|1E}IPEfxfXQQ`*t&;V@$>A83}ijoEtKLd2@r2oqVlu9quh_(>|gIuH#Tw!x$ z7Sz+2{z@O{>2=srG~@Ma^p$h4s*YbZ?OrvVu9^rEu~ikmnvDOeN}JThs;co*HR~8v zgT+PrKg;SF2yWy_InqI@XB;({k&^UfQep*0soY>D#`Nz*1oSFv&jZNj0`MOIBAvJ@ z&jY;9&38uA@ZtgIpqtYxmVl=E+}A01Z%52l|Mcuu+*>wax(2ucsR8esIRCAhWhXrc zU~EKi0WR9G*x1T>OJ}!fv32bXXyv=q1AbsJ)gOQzHB9I#5zK^uGs%;gBOBd0!=hqQ z!|}i>9wDe+Z42~&J5(W-`=j?H%s7O}c5*W2>LMnZh)CdJTi>6VLw!bu;TI$PX7Qht zYzhMbbNbi-u5Hx^s5w6Wtu(EBm-RC_K-h~LAR_qzemg)9F3?nVYILC91^|@Z0WKNf z$Op)F01m>4-}f*F%>ZwF0Hhl?2@9XMJ*Q-5+a~k2cMey6J$aw?LJnAu2S8g3;52}* zR*SI%BBH|pqS!D~WET!dD_*lmz#2n(m|o2Z+<4FeU6j>W&aMX9H>JyDB(!BF5El&+ z^0X9MhmyyV8p+s#9e}{{XDa_hDGgNkYU~}N9YFu)EB}D?;UxB*dy59q-Iv{4Im zJ|I3DJBKqsZOJ^V%cA?;fM_3JX`n9C^Z$)}C2#}!WPs)Y!EZo^)&jK^g7^ShJhsk& zDp@0z15^uU7uyd2@~AkIUr3Y{t@Au`;`e=#Jyoz{L2&>d+%3>BQ+OTe|GAe00(gVI g_1R>pel$X_TOUUMuAP-fp9GehO}KIyuj`DA1|I@qumAu6 literal 1915 zcmV->2ZZ=2wJ-euSd}{fiexYpMgWmk2mJs4|9`Lle+R=NsvJPj3NEd0e*w^xQp=4Z zseK_roX1L@TO@%Ai-fHHNfw=5*x9w^f&T!36rAW$K)F6q3UCT92!?sqB0v|AlMnF(3&{jai1{07F z1A1RVpdpcl5C!bqJG)>a97H1^C!r4!28+7|Y_?UdUTZOXxzGJ*8fPi!G2Ek7H5bb@ zoYkyak5hwLkGSLvB2bAJEnrM=;TeqUkIbZQDNhsAqG@JqEWZk4z};22`-+p%D73+>*6Jox-l&IQT4of>V@A=mtkuuF(bTKuSG(9& zi{o%+EzY>jTH2OvS;g48m#ejU3+P$d;RufwOL==t;77s&XDROtdNeGvn6<2P^rURu zH+l?5dgl-Z8&82qrBL1frPFhFufz{sV+n5Z~39ge$(ld0UDK!6R$-95mDlTpCoaQ6Z3F2LP4+&zH10}v5l zHZbn)m=1T(G{D?BDgl*O9aRT`j1x6jR4J8Ld1X&R88BzLd87FYO(9s0AebLjAeBlz zzT4Hd56-@OkrWXMX(V#tBs3z4q=-mFqYNe(6v?lp-J{VRp)9@VLsIgn+z!+mCxM2n#XP=lo2F8V~~6_C6$_!rK$ z4P>qTa2&SXn`7Rn9>dXJ|J|?bY(A=c-BOPFd=$57+Qr!34BNYbGOlWutJ=LI{M}{R#*Q>Mp`DnNF@szI3 z9Y@r42PW)_1$_LF5>mYy4AKNr+H>+}07^_ncn z?_PuDh@HsLG&9K;qSI4r3wh@dEtWE=u}5`(rl-(Emfv3>RJupWF%?S?&+|N#O81_3 zz2|lBY4c^D$ne}Kf2GJW+-CXLZ=JH78Vp;?bLa6aO@BQo+Nj>&Js3W!`@C~V-g+GI z7l=nyx<~by?yN1Ge{nlJJCNb{m)1Rp6EYkh)xB7L1@}uRuZwb+4D3YSn zZA+^p>p`Rc1b>F~=xgm4Rb`!4678fT=h7wV-pF0q+Ow6Weff z15r8wlu8GM_BYyLS<$+29r_F{=;*bdB?C85Sl@;XYekO@9OVAw<<#HdALij5Yu-(2lM|XJG~wAFv&3!qjEI*BoTkx8B@_Q<9eD%Cntn$VY{X zDKF}b{O#LO`{5Jb1T=AZ;<@H{zZ#b?x!NEBgtiv)%q4$DO4%P`MtBx?khV0kW~d2aH_1Xu>t3}8VoRE7Z{GUp*s+X41iVL&H+9@07YKu z_aV~!XMlkR__cA5qU=X7=N@nt0+}{fL%;INAU{XXsR83Wz|^(CnZHa9VD0q)ihSgN zKI8QZ2@?XOr@m^Y8RJb@v%EWBHY=t+MYgDd zU1=8vn7X|MhA{O4oW5$do8RTA^kvAVpD%x-k8v%E&cEB