diff options
-rw-r--r-- | _assets/500-miles.gif | bin | 0 -> 506856 bytes | |||
-rw-r--r-- | _assets/cloud.gif | bin | 0 -> 42012550 bytes | |||
-rw-r--r-- | _assets/lazy-wheel.jpg | bin | 0 -> 44389 bytes | |||
-rw-r--r-- | _assets/pip-8771.svg | 374 | ||||
-rw-r--r-- | _assets/pip-parallel-dl.pdf | bin | 0 -> 246038 bytes | |||
-rw-r--r-- | _assets/swirl.png | bin | 0 -> 139115 bytes | |||
-rw-r--r-- | blog/gsoc2020/blog20200609.md | 112 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200622.md | 113 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200706.md | 78 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200720.md | 84 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200803.md | 46 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200817.md | 52 | ||||
-rw-r--r-- | blog/gsoc2020/blog20200831.md | 109 |
13 files changed, 968 insertions, 0 deletions
diff --git a/_assets/500-miles.gif b/_assets/500-miles.gif new file mode 100644 index 0000000..b49c7bd --- /dev/null +++ b/_assets/500-miles.gif Binary files differdiff --git a/_assets/cloud.gif b/_assets/cloud.gif new file mode 100644 index 0000000..3633fbe --- /dev/null +++ b/_assets/cloud.gif Binary files differdiff --git a/_assets/lazy-wheel.jpg b/_assets/lazy-wheel.jpg new file mode 100644 index 0000000..f1be565 --- /dev/null +++ b/_assets/lazy-wheel.jpg Binary files differdiff --git a/_assets/pip-8771.svg b/_assets/pip-8771.svg new file mode 100644 index 0000000..e72e372 --- /dev/null +++ b/_assets/pip-8771.svg @@ -0,0 +1,374 @@ +<?xml version="1.0"?> +<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="691.533333306" height="489.99999999999994" font-family="Consolas, Menlo, 'Bitstream Vera Sans Mono', monospace, 'Powerline Symbols'" font-size="14px"> +<style> +<!-- tango theme --> + +.default-text-fill {fill: #cccccc} +.default-bg-fill {fill: #121314} + +.c-0 {fill: #000000} +.c-1 {fill: #cc0000} +.c-2 {fill: #4e9a06} +.c-3 {fill: #c4a000} +.c-4 {fill: #3465a4} +.c-5 {fill: #75507b} +.c-6 {fill: #06989a} +.c-7 {fill: #d3d7cf} +.c-8 {fill: #555753} +.c-9 {fill: #ef2929} +.c-10 {fill: #8ae234} +.c-11 {fill: #fce94f} +.c-12 {fill: #729fcf} +.c-13 {fill: #ad7fa8} +.c-14 {fill: #34e2e2} +.c-15 {fill: #eeeeec} +.c-8, .c-9, .c-10, .c-11, .c-12, .c-13, .c-14, .c-15 {font-weight: bold} + +<!-- 256 colors --> + +.c-16 {fill: #000000} +.c-17 {fill: #00005f} +.c-18 {fill: #000087} +.c-19 {fill: #0000af} +.c-20 {fill: #0000d7} +.c-21 {fill: #0000ff} +.c-22 {fill: #005f00} +.c-23 {fill: #005f5f} +.c-24 {fill: #005f87} +.c-25 {fill: #005faf} +.c-26 {fill: #005fd7} +.c-27 {fill: #005fff} +.c-28 {fill: #008700} +.c-29 {fill: #00875f} +.c-30 {fill: #008787} +.c-31 {fill: #0087af} +.c-32 {fill: #0087d7} +.c-33 {fill: #0087ff} +.c-34 {fill: #00af00} +.c-35 {fill: #00af5f} +.c-36 {fill: #00af87} +.c-37 {fill: #00afaf} +.c-38 {fill: #00afd7} 
+.c-39 {fill: #00afff} +.c-40 {fill: #00d700} +.c-41 {fill: #00d75f} +.c-42 {fill: #00d787} +.c-43 {fill: #00d7af} +.c-44 {fill: #00d7d7} +.c-45 {fill: #00d7ff} +.c-46 {fill: #00ff00} +.c-47 {fill: #00ff5f} +.c-48 {fill: #00ff87} +.c-49 {fill: #00ffaf} +.c-50 {fill: #00ffd7} +.c-51 {fill: #00ffff} +.c-52 {fill: #5f0000} +.c-53 {fill: #5f005f} +.c-54 {fill: #5f0087} +.c-55 {fill: #5f00af} +.c-56 {fill: #5f00d7} +.c-57 {fill: #5f00ff} +.c-58 {fill: #5f5f00} +.c-59 {fill: #5f5f5f} +.c-60 {fill: #5f5f87} +.c-61 {fill: #5f5faf} +.c-62 {fill: #5f5fd7} +.c-63 {fill: #5f5fff} +.c-64 {fill: #5f8700} +.c-65 {fill: #5f875f} +.c-66 {fill: #5f8787} +.c-67 {fill: #5f87af} +.c-68 {fill: #5f87d7} +.c-69 {fill: #5f87ff} +.c-70 {fill: #5faf00} +.c-71 {fill: #5faf5f} +.c-72 {fill: #5faf87} +.c-73 {fill: #5fafaf} +.c-74 {fill: #5fafd7} +.c-75 {fill: #5fafff} +.c-76 {fill: #5fd700} +.c-77 {fill: #5fd75f} +.c-78 {fill: #5fd787} +.c-79 {fill: #5fd7af} +.c-80 {fill: #5fd7d7} +.c-81 {fill: #5fd7ff} +.c-82 {fill: #5fff00} +.c-83 {fill: #5fff5f} +.c-84 {fill: #5fff87} +.c-85 {fill: #5fffaf} +.c-86 {fill: #5fffd7} +.c-87 {fill: #5fffff} +.c-88 {fill: #870000} +.c-89 {fill: #87005f} +.c-90 {fill: #870087} +.c-91 {fill: #8700af} +.c-92 {fill: #8700d7} +.c-93 {fill: #8700ff} +.c-94 {fill: #875f00} +.c-95 {fill: #875f5f} +.c-96 {fill: #875f87} +.c-97 {fill: #875faf} +.c-98 {fill: #875fd7} +.c-99 {fill: #875fff} +.c-100 {fill: #878700} +.c-101 {fill: #87875f} +.c-102 {fill: #878787} +.c-103 {fill: #8787af} +.c-104 {fill: #8787d7} +.c-105 {fill: #8787ff} +.c-106 {fill: #87af00} +.c-107 {fill: #87af5f} +.c-108 {fill: #87af87} +.c-109 {fill: #87afaf} +.c-110 {fill: #87afd7} +.c-111 {fill: #87afff} +.c-112 {fill: #87d700} +.c-113 {fill: #87d75f} +.c-114 {fill: #87d787} +.c-115 {fill: #87d7af} +.c-116 {fill: #87d7d7} +.c-117 {fill: #87d7ff} +.c-118 {fill: #87ff00} +.c-119 {fill: #87ff5f} +.c-120 {fill: #87ff87} +.c-121 {fill: #87ffaf} +.c-122 {fill: #87ffd7} +.c-123 {fill: #87ffff} +.c-124 {fill: 
#af0000} +.c-125 {fill: #af005f} +.c-126 {fill: #af0087} +.c-127 {fill: #af00af} +.c-128 {fill: #af00d7} +.c-129 {fill: #af00ff} +.c-130 {fill: #af5f00} +.c-131 {fill: #af5f5f} +.c-132 {fill: #af5f87} +.c-133 {fill: #af5faf} +.c-134 {fill: #af5fd7} +.c-135 {fill: #af5fff} +.c-136 {fill: #af8700} +.c-137 {fill: #af875f} +.c-138 {fill: #af8787} +.c-139 {fill: #af87af} +.c-140 {fill: #af87d7} +.c-141 {fill: #af87ff} +.c-142 {fill: #afaf00} +.c-143 {fill: #afaf5f} +.c-144 {fill: #afaf87} +.c-145 {fill: #afafaf} +.c-146 {fill: #afafd7} +.c-147 {fill: #afafff} +.c-148 {fill: #afd700} +.c-149 {fill: #afd75f} +.c-150 {fill: #afd787} +.c-151 {fill: #afd7af} +.c-152 {fill: #afd7d7} +.c-153 {fill: #afd7ff} +.c-154 {fill: #afff00} +.c-155 {fill: #afff5f} +.c-156 {fill: #afff87} +.c-157 {fill: #afffaf} +.c-158 {fill: #afffd7} +.c-159 {fill: #afffff} +.c-160 {fill: #d70000} +.c-161 {fill: #d7005f} +.c-162 {fill: #d70087} +.c-163 {fill: #d700af} +.c-164 {fill: #d700d7} +.c-165 {fill: #d700ff} +.c-166 {fill: #d75f00} +.c-167 {fill: #d75f5f} +.c-168 {fill: #d75f87} +.c-169 {fill: #d75faf} +.c-170 {fill: #d75fd7} +.c-171 {fill: #d75fff} +.c-172 {fill: #d78700} +.c-173 {fill: #d7875f} +.c-174 {fill: #d78787} +.c-175 {fill: #d787af} +.c-176 {fill: #d787d7} +.c-177 {fill: #d787ff} +.c-178 {fill: #d7af00} +.c-179 {fill: #d7af5f} +.c-180 {fill: #d7af87} +.c-181 {fill: #d7afaf} +.c-182 {fill: #d7afd7} +.c-183 {fill: #d7afff} +.c-184 {fill: #d7d700} +.c-185 {fill: #d7d75f} +.c-186 {fill: #d7d787} +.c-187 {fill: #d7d7af} +.c-188 {fill: #d7d7d7} +.c-189 {fill: #d7d7ff} +.c-190 {fill: #d7ff00} +.c-191 {fill: #d7ff5f} +.c-192 {fill: #d7ff87} +.c-193 {fill: #d7ffaf} +.c-194 {fill: #d7ffd7} +.c-195 {fill: #d7ffff} +.c-196 {fill: #ff0000} +.c-197 {fill: #ff005f} +.c-198 {fill: #ff0087} +.c-199 {fill: #ff00af} +.c-200 {fill: #ff00d7} +.c-201 {fill: #ff00ff} +.c-202 {fill: #ff5f00} +.c-203 {fill: #ff5f5f} +.c-204 {fill: #ff5f87} +.c-205 {fill: #ff5faf} +.c-206 {fill: #ff5fd7} +.c-207 {fill: 
#ff5fff} +.c-208 {fill: #ff8700} +.c-209 {fill: #ff875f} +.c-210 {fill: #ff8787} +.c-211 {fill: #ff87af} +.c-212 {fill: #ff87d7} +.c-213 {fill: #ff87ff} +.c-214 {fill: #ffaf00} +.c-215 {fill: #ffaf5f} +.c-216 {fill: #ffaf87} +.c-217 {fill: #ffafaf} +.c-218 {fill: #ffafd7} +.c-219 {fill: #ffafff} +.c-220 {fill: #ffd700} +.c-221 {fill: #ffd75f} +.c-222 {fill: #ffd787} +.c-223 {fill: #ffd7af} +.c-224 {fill: #ffd7d7} +.c-225 {fill: #ffd7ff} +.c-226 {fill: #ffff00} +.c-227 {fill: #ffff5f} +.c-228 {fill: #ffff87} +.c-229 {fill: #ffffaf} +.c-230 {fill: #ffffd7} +.c-231 {fill: #ffffff} +.c-232 {fill: #080808} +.c-233 {fill: #121212} +.c-234 {fill: #1c1c1c} +.c-235 {fill: #262626} +.c-236 {fill: #303030} +.c-237 {fill: #3a3a3a} +.c-238 {fill: #444444} +.c-239 {fill: #4e4e4e} +.c-240 {fill: #585858} +.c-241 {fill: #626262} +.c-242 {fill: #6c6c6c} +.c-243 {fill: #767676} +.c-244 {fill: #808080} +.c-245 {fill: #8a8a8a} +.c-246 {fill: #949494} +.c-247 {fill: #9e9e9e} +.c-248 {fill: #a8a8a8} +.c-249 {fill: #b2b2b2} +.c-250 {fill: #bcbcbc} +.c-251 {fill: #c6c6c6} +.c-252 {fill: #d0d0d0} +.c-253 {fill: #dadada} +.c-254 {fill: #e4e4e4} +.c-255 {fill: #eeeeee} + +.br { font-weight: bold } +.it { font-style: italic } +.un { text-decoration: underline } +</style> + +<rect width="100%" height="100%" class="default-bg-fill" rx="4" ry="4" /> + +<svg x="1.220%" y="2.000%" class="default-text-fill"> +<g style="shape-rendering: optimizeSpeed"> +<rect x="31.707%" y="4.000%" width="1.220%" height="19.7" class="c-7" /> +</g> + +<text class="default-text-fill"> +<tspan y="0.000%"> +<tspan dy="1em" x="0.000%" class="br c-10">c</tspan><tspan x="1.220%" class="br c-10">n</tspan><tspan x="2.439%" class="br c-10">x</tspan><tspan x="3.659%" class="br c-10">@</tspan><tspan x="4.878%" class="br c-10">d</tspan><tspan x="6.098%" class="br c-10">e</tspan><tspan x="7.317%" class="br c-10">b</tspan><tspan x="8.537%" class="br c-10">i</tspan><tspan x="9.756%" class="br c-10">a</tspan><tspan x="10.976%" 
class="br c-10">n</tspan><tspan x="12.195%">:</tspan><tspan x="13.415%" class="br c-12">~</tspan><tspan x="14.634%">$</tspan><tspan x="17.073%">s</tspan><tspan x="18.293%">o</tspan><tspan x="19.512%">u</tspan><tspan x="20.732%">r</tspan><tspan x="21.951%">c</tspan><tspan x="23.171%">e</tspan><tspan x="25.610%">/</tspan><tspan x="26.829%">t</tspan><tspan x="28.049%">m</tspan><tspan x="29.268%">p</tspan><tspan x="30.488%">/</tspan><tspan x="31.707%">f</tspan><tspan x="32.927%">a</tspan><tspan x="34.146%">s</tspan><tspan x="35.366%">t</tspan><tspan x="36.585%">-</tspan><tspan x="37.805%">d</tspan><tspan x="39.024%">e</tspan><tspan x="40.244%">p</tspan><tspan x="41.463%">s</tspan><tspan x="42.683%">/</tspan><tspan x="43.902%">b</tspan><tspan x="45.122%">i</tspan><tspan x="46.341%">n</tspan><tspan x="47.561%">/</tspan><tspan x="48.780%">a</tspan><tspan x="50.000%">c</tspan><tspan x="51.220%">t</tspan><tspan x="52.439%">i</tspan><tspan x="53.659%">v</tspan><tspan x="54.878%">a</tspan><tspan x="56.098%">t</tspan><tspan x="57.317%">e</tspan> +</tspan> +<tspan y="4.000%"> +<tspan dy="1em" x="0.000%">(</tspan><tspan x="1.220%">f</tspan><tspan x="2.439%">a</tspan><tspan x="3.659%">s</tspan><tspan x="4.878%">t</tspan><tspan x="6.098%">-</tspan><tspan x="7.317%">d</tspan><tspan x="8.537%">e</tspan><tspan x="9.756%">p</tspan><tspan x="10.976%">s</tspan><tspan x="12.195%">)</tspan><tspan x="14.634%" class="br c-10">c</tspan><tspan x="15.854%" class="br c-10">n</tspan><tspan x="17.073%" class="br c-10">x</tspan><tspan x="18.293%" class="br c-10">@</tspan><tspan x="19.512%" class="br c-10">d</tspan><tspan x="20.732%" class="br c-10">e</tspan><tspan x="21.951%" class="br c-10">b</tspan><tspan x="23.171%" class="br c-10">i</tspan><tspan x="24.390%" class="br c-10">a</tspan><tspan x="25.610%" class="br c-10">n</tspan><tspan x="26.829%">:</tspan><tspan x="28.049%" class="br c-12">~</tspan><tspan x="29.268%">$</tspan> +</tspan> +<tspan y="8.000%"> + +</tspan> +<tspan y="12.000%"> + 
+</tspan> +<tspan y="16.000%"> + +</tspan> +<tspan y="20.000%"> + +</tspan> +<tspan y="24.000%"> + +</tspan> +<tspan y="28.000%"> + +</tspan> +<tspan y="32.000%"> + +</tspan> +<tspan y="36.000%"> + +</tspan> +<tspan y="40.000%"> + +</tspan> +<tspan y="44.000%"> + +</tspan> +<tspan y="48.000%"> + +</tspan> +<tspan y="52.000%"> + +</tspan> +<tspan y="56.000%"> + +</tspan> +<tspan y="60.000%"> + +</tspan> +<tspan y="64.000%"> + +</tspan> +<tspan y="68.000%"> + +</tspan> +<tspan y="72.000%"> + +</tspan> +<tspan y="76.000%"> + +</tspan> +<tspan y="80.000%"> + +</tspan> +<tspan y="84.000%"> + +</tspan> +<tspan y="88.000%"> + +</tspan> +<tspan y="92.000%"> + +</tspan> + +</text> + +<g transform="translate(-50 -50)"> +<svg x="50%" y="50%" width="100" height="100"> +<svg version="1.1" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 866.0254037844387 866.0254037844387"> + <defs> + <mask id="small-triangle-mask"> + <rect width="100%" height="100%" fill="white"/> + <polygon points="508.01270189221935 433.01270189221935, 208.0127018922194 259.8076211353316, 208.01270189221927 606.217782649107" fill="black"></polygon> + </mask> + </defs> + <polygon points="808.0127018922194 433.01270189221935, 58.01270189221947 -1.1368683772161603e-13, 58.01270189221913 866.0254037844386" mask="url(#small-triangle-mask)" fill="white"></polygon> + <polyline points="481.2177826491071 333.0127018922194, 134.80762113533166 533.0127018922194" stroke="white" stroke-width="90"></polyline> +</svg> + +</svg> +</g> +</svg> +</svg> diff --git a/_assets/pip-parallel-dl.pdf b/_assets/pip-parallel-dl.pdf new file mode 100644 index 0000000..4bd5898 --- /dev/null +++ b/_assets/pip-parallel-dl.pdf Binary files differdiff --git a/_assets/swirl.png b/_assets/swirl.png new file mode 100644 index 0000000..fdaf0e6 --- /dev/null +++ b/_assets/swirl.png Binary files differdiff --git a/blog/gsoc2020/blog20200609.md b/blog/gsoc2020/blog20200609.md new file mode 100644 index 0000000..b0e6a7b --- /dev/null +++ 
b/blog/gsoc2020/blog20200609.md @@ -0,0 +1,112 @@ ++++ +rss = "GSoC 2020: Unexpected Things When You're Expecting" +date = Date(2020, 6, 9) ++++ +@def tags = ["pip", "gsoc"] + +# Unexpected Things When You're Expecting + +Hi everyone, I hope that you are all doing well and wish you all good health! +The last week has not been really kind to me with a decent amount of +academic pressure (my school year is lasting until early July). +It would be bold to say that I have spent 10 hours working on my GSoC project +since the last check-in, let alone the 30 hours per week requirement. +That being said, there were still some discoveries that I wish to share. + +\toc + +## The `multiprocessing[.dummy]` wrapper + +Most of the time I spent was to finalize the multi{processing,threading} +wrapper for the `map` function that submits tasks to the worker pool. +To my surprise, it is rather difficult to write something that is +not only portable but also easy to read and test. + +By {{pip 8320 "the latest commit"}}, I realized the following: + +1. The `multiprocessing` module was not designed for the implementation + details to be abstracted away entirely. For example, the lazy `map`'s + could be really slow without specifying a suitable chunk size + (to cut the input iterable and distribute them to workers in the pool). + By *suitable*, I mean only an order of magnitude smaller than the input. This defeats + half of the purpose of making it lazy: allowing the input to be + evaluated lazily. Luckily, in the use case I'm aiming for, the length of + the iterable argument is small and the laziness is only needed for the output + (to pipeline download and installation). +2. Mocking `import` for testing purposes can never be pretty. One reason + is that we (Python users) have very little control over the calls of + `import` statements and its lower-level implementation `__import__`. 
+ In order to properly patch this built-in function, unlike for others + of the same group, we have to `monkeypatch` the name from `builtins` + (or `__builtins__` under Python 2) instead of the module that imports stuff. + Furthermore, because of the special namespacing, to avoid infinite recursion + we need to alias the function to a different name for fallback. +3. To add to the problem, `multiprocessing` lazily imports the fragile module + during pool creation. Since the failure is platform-specific + (the lack of `sem_open`), it was decided to check upon the import + of the `pip`'s module. Although the behavior is easier to reason about + in human language, testing it requires invalidating the cached import and + re-importing the wrapper module. +4. Last but not least, I now understand the pain of keeping Python 2 + compatibility that many package maintainers still need to deal with + every day (although Python 2 has reached its end-of-life, `pip`, for + example, {{pip 6148 "will still support it for another year"}}). + +## The change in direction + +Since last week, my mentor Pradyun Gedam and I have set up a weekly real-time +meeting (a fancy term for video/audio chat in the worldwide quarantine +era) for the entire GSoC period. During the last session, we decided to +put parallelization of download during resolution on hold, in favor of a +more beneficial goal: {{pip 7819 "partially download the wheels during +dependency resolution"}}. + +![](/assets/swirl.png) + +As discussed by Danny McClanahan and the maintainers of `pip`, it is feasible +to only download a few kB of a wheel to obtain enough metadata for +the resolution of dependencies. While this is only applicable to wheels +(i.e. prebuilt packages), other packaging formats only make up less than 20% +of the downloads (at least on PyPI), and the figure is much less for +the most popular packages. Therefore, this optimization alone could make +[the upcoming backtracking resolver][]'s performance on par with the legacy one. 
+ +During the last few years, there has been a lot of effort being poured into +replacing `pip`'s current resolver that is unable to resolve conflicts. +While its correctness will be ensured by some of the most talented and +hard-working developers in the Python packaging community, from the users' +point of view, it would be better to have its performance not lagging +behind the old one. Aside from the increase in CPU cycles for more +rigorous resolution, more I/O, especially networking operations, is expected +to be performed. This is due to {{pip 7406#issuecomment-583891169 "the lack +of a standard and efficient way to acquire the metadata"}}. Therefore, unlike +most package managers we are familiar with, `pip` has to fetch +(and possibly build) the packages solely for dependency information. + +Fortunately, {{pep 427 recommended-archiver-features}} recommends +package builders to place the metadata at the end of the archive. +This allows the resolver to only fetch the last few kB using +[HTTP range requests][] for the relevant information. +Simply appending `Range: bytes=-8000` to the request header +in `pip._internal.network.download` makes the resolution process +*lightning* fast. Of course this breaks the installation but I am confident +that it is not difficult to implement this optimization cleanly. + +One drawback of this optimization is the compatibility. Not every Python +package index supports range requests, and it is not possible to verify +the partial wheel. While the first case is unavoidable, for the other, +hash checking is usually used for pinned/locked-version requirements, +thus no backtracking is done during dependency resolution. + +Either way, before installation, the packages selected by the resolver +can be downloaded in parallel. This guarantees a larger crowd of packages, +compared to parallelization during resolution, where the number of downloads +can be as low as one during trial of different versions of the same package. 
+ +Unfortunately, I have not been able to do much other than +{{pip 8411 "a minor clean up"}}. I am looking forward to accomplishing more +this week and seeing what this path will lead us to! At the moment, +I am happy that I'm able to meet the blog deadline, at least in UTC! + +[the upcoming backtracking resolver]: http://www.ei8fdb.org/thoughts/2020/05/test-pips-alpha-resolver-and-help-us-document-dependency-conflicts +[HTTP range requests]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests diff --git a/blog/gsoc2020/blog20200622.md b/blog/gsoc2020/blog20200622.md new file mode 100644 index 0000000..3bb3a2c --- /dev/null +++ b/blog/gsoc2020/blog20200622.md @@ -0,0 +1,113 @@ ++++ +rss = "GSoC 2020: The Wonderful Wizard of O'zip" +date = Date(2020, 6, 22) ++++ +@def tags = ["pip", "gsoc"] + +# The Wonderful Wizard of O'zip + +> Never give up... No one knows what's going to happen next. + +\toc + +## Preface + +Greetings and best wishes! I had a lot of fun during the last week, +although admittedly nothing was really finished. In summary, +these are the works I carried out in the last seven days: + +* Finalizing {{pip 8320 "utilities for parallelization"}} +* {{pip 8467 "Continuing experimenting"}} + on {{pip 8442 "using lazy wheels for dependency resolution"}} +* Polishing up {{pip 8411 "the patch"}} refactoring + `operations.prepare.prepare_linked_requirement` +* Adding `flake8-logging-format` + {{pip 8423#issuecomment-645418725 "to the linter"}} +* Splitting {{pip 8456 "the linting patch"}} from {{pip 8332 "the PR adding + the license requirement to vendor README"}} + +## The `multiprocessing[.dummy]` wrapper + +Yes, you read it right, this is the same section as last fortnight's blog. +My mentor Pradyun Gedam gave me a green light to have {{pip 8411}} merged +without support for Python 2 and the non-lazy map variant, which turns out +to be troublesome for multithreading. 
+ +The tests still need to pass of course and the flaky tests (see failing tests +over Azure Pipeline in the past) really gave me a panic attack earlier today. +We probably need to mark them as xfail or investigate why they are +nondeterministic specifically on Azure, but the real reason I was *all caught up +and confused* was that the unit tests I added mess with the cached imports +and as `pip`'s tests are run in parallel, who knows what it might affect. +I was so relieved to not discover any new set of tests made flaky by ones +I'm trying to add! + +## The file-like object mapping ZIP over HTTP + +This is where the fun starts. Before we dive in, let's recall some +background information on this. As discovered by Danny McClanahan +in {{pip 7819}}, it is possible to only download a portion of a wheel +and it's still valid for `pip` to get the distribution's metadata. +In the same thread, Daniel Holth suggested that one may use +HTTP range requests to specifically ask for the tail of the wheel, +where the ZIP's central directory record as well as, usually, +`dist-info` (the directory containing `METADATA`) can be found. + +Well, *usually*. While {{pep 427}} does indeed recommend + +> Archivers are encouraged to place the `.dist-info` files physically +> at the end of the archive. This enables some potentially interesting +> ZIP tricks including the ability to amend the metadata without +> rewriting the entire archive. + +one of the mentioned *tricks* is adding shared libraries to wheels +of extension modules (using e.g. `auditwheel` or `delocate`). +Thus for non-pure Python wheels, it is unlikely that the metadata +lie in the last few megabytes. Ignoring source distributions is bad enough, +we can't afford making an optimization that doesn't work for extension modules, +which are still an integral part of the Python ecosystem )-: + +But hey, the ZIP's directory record is guaranteed to be at the end of the file! +Couldn't we do something about that? 
The short answer is yes. The long answer +is, well, yessssssss! That, plus magic provided by most operating systems, +this is what we figured out: + +1. We can download a relatively small chunk at the end of the wheel + until it is recognizable as a valid ZIP file. +2. In order for the end of the archive to actually appear as the end to + `zipfile`, we feed to it an object with `seek` and `read` defined. + As navigating to the rear of the file is performed by calling `seek` + with relative offset and `whence=SEEK_END` (see `man 3 fseek` + for more details), we are completely able to make the wheels in the cloud + behave as if they were available locally. + + ![Wheel in the cloud](/assets/cloud.gif) + +3. For large wheels, it is better to store them in hard disks instead of memory. + For smaller ones, it is also preferable to store it as a file to avoid + (error-prone and often not really efficient) manual tracking and joining + of downloaded segments. We only use a small portion of the wheel, however + just in case one is wondering, we have very little control over + when `tempfile.SpooledTemporaryFile` rolls over, so the memory-disk hybrid + is not exactly working as expected. +4. With all these in mind, all we have to do is to define an intermediate object + that checks for local availability and downloads if needed on calls to `read`, + to lazily provide the data over HTTP and reduce execution time. + +The only theoretical challenge left is to keep track of downloaded intervals, +which I finally figured out after a few trials and errors. The code +was submitted as a pull request to `pip` at {{pip 8467}}. A more modern +(read: Python 3-only) variant was packaged and uploaded to PyPI under +the name of [lazip][]. I am unaware of any use case for it outside of `pip`, +but it's certainly fun to play with d-: + +## What's next? + +I have been falling short of getting the PRs mentioned above merged for +quite a while. 
With `pip`'s next beta coming really soon, I have to somehow +make the patches reach a certain standard and get enough attention to be part of +the pre-release—beta-testing would greatly help the success of the GSoC project. +To other GSoC students and mentors reading this, I also hope your projects +turn out successful! + +[lazip]: https://pypi.org/project/lazip/ diff --git a/blog/gsoc2020/blog20200706.md b/blog/gsoc2020/blog20200706.md new file mode 100644 index 0000000..9c41b31 --- /dev/null +++ b/blog/gsoc2020/blog20200706.md @@ -0,0 +1,78 @@ ++++ +rss = "GSoC 2020: I'm Not Drowning On My Own" +date = Date(2020, 7, 6) ++++ +@def tags = ["pip", "gsoc"] + +# I'm Not Drowning On My Own + +\toc + +## Cold Water + +Hello there! My schoolyear is coming to an end, with some final assignments +and group projects left to be done. I for sure underestimated the workload +of these and in the last (and probably next) few days I'm drowning in work +trying to meet my deadlines. + +One project that might be remotely relevant is [cheese-shop][], which tries to +manage the metadata of packages from the real [Cheese Shop][]. Other than that, +schoolwork is draining a lot of my time and I can't remember the last time +I came up with something new for my GSoC project )-; + +## Warm Water + +On the bright side, I received a lot of help and encouragement +from contributors and stakeholders of `pip`. In the last week alone, +I had five pull requests merged: + +* {{pip 8332}}: Add license requirement to `_vendor/README.rst` +* {{pip 8320}}: Add utilities for parallelization +* {{pip 8504}}: Parallelize `pip list --outdated` and `--uptodate` +* {{pip 8411}}: Refactor `operations.prepare.prepare_linked_requirement` +* {{pip 8467}}: Add utility to lazily acquire wheel metadata over HTTP + +In addition to helping me get my PRs merged, my mentor Pradyun Gedam +also gave me my first official feedback, including what I'm doing right +(and wrong too!) 
and what I should keep doing to increase the chance of +the project being successful. + +{{pip 7819}}'s roadmap (Danny McClanahan's discoveries and works on lazy wheels) +is being closely tracked by `hatch`'s maintainer Ofek Lev, which really +makes me proud and warms my heart, that what I'm helping build is actually +needed by the community! + +## Learning How To Swim + +With {{pip 8467}} and {{pip 8530}} merged, I'm now working on {{pip 8532}} +which aims to roll out the lazy wheel as the way to obtain +dependency information via the CLI flag `--use-feature=lazy-wheel`. + +{{pip 8532}} was failing initially, despite being relatively trivial and despite +the commit it was based on passing. Surprisingly, after rebasing it +on top of {{pip 8530}}, it suddenly became green mysteriously. After the first +(early) review, I was able to iterate on my earlier code, which used +the ambiguous exception `RuntimeError`. + +The rest to be done is *just* adding some functional tests (I'm pretty sure +this will be either overwhelming or underwhelming) to make sure that +the command-line flag is working correctly. Hopefully this can make it into +the beta of the upcoming release {{pip 8511 "this month"}}. + +![Lazy wheel](/assets/lazy-wheel.jpg) + +In other news, I've also submitted {{pip 8538 "a patch improving the tests +for the parallelization utilities"}}, which were really messy when I wrote them. +Better late than never! + +Metaphors aside, I actually can't swim d-: + +## Diving Plan + +After {{pip 8532}}, I think I'll try to parallelize downloads of wheels +that are lazily fetched only for metadata. By the current implementation +of the new resolver, for `pip install`, this can be injected directly +between the resolution and build/installation process. 
+ +[cheese-shop]: https://github.com/McSinyx/cheese-shop +[Cheese Shop]: https://pypi.org diff --git a/blog/gsoc2020/blog20200720.md b/blog/gsoc2020/blog20200720.md new file mode 100644 index 0000000..43738a7 --- /dev/null +++ b/blog/gsoc2020/blog20200720.md @@ -0,0 +1,84 @@ ++++ +rss = "GSoC 2020: I've Walked 500 Miles..." +date = Date(2020, 7, 20) ++++ +@def tags = ["pip", "gsoc"] + +# I've Walked 500 Miles... + +> ... and I would walk 500 more\ +> Just to be the man who walks a thousand miles\ +> To fall down at your door +> +> ![500 miles](/assets/500-miles.gif) + +\toc + +## The Main Road + +Hi, have you met `fast-deps`? It's (going to be) the name of `pip`'s +experimental feature that may improve the speed of dependency resolution +of the new resolver. By avoiding downloading whole wheels to just +obtain metadata, it is especially helpful when `pip` has to do +heavy backtracking to resolve conflicts. + +Thanks to {{pip 8532#discussion_r453990728 "Chris Hunt's review on GH-8532"}}, +my mentor Pradyun Gedam and I worked out a less hacky approach to interject +the call to lazy wheel during the resolution process. A new PR {{pip 8588}} +was filed to implement it—I could have *just* worked on top of the old PR +and rebased, but my `git` skill is far from gud enuff to confidently do it. + +Testing this one has been a lot of fun though. At first, integration tests +were added as a rerun of the tests for the new resolver, with an additional flag +to use feature `fast-deps`. It indeed made me feel guilty towards [Travis][], +who has to work around 30 minutes more every run. Per Chris Hunt's suggestion, +in the new PR, I instead write a few functional tests for the area relating +the most to the feature, namely `pip`'s subcommands `wheel`, +`download` and `install`. + +It was also suggested that a mock server with HTTP range requests support +might be better for testing (in terms of performance and reliability). 
+However, {{pip 8584#issuecomment-659227702 "I have yet to be able to make +Werkzeug do it"}}. + +Why did I say I'm half way there? With the parallel utilities merged and a way +to quickly get the list of distributions to be downloaded being really close, +what is left is *only* to figure out a way to properly download them in parallel. +With no distribution to be added during the download progress, the model of this +will fit very well with the architecture in [my original proposal][]. +A batch downloader can be implemented to track the progress of each download +and thus report them cleanly as e.g. progress bar or percentage. This is +the part I am second-most excited about of my GSoC project this summer +(after the synchronization of downloads written in my proposal, which was then +superseded by `fast-deps`) and I can't wait to do it! + +## The Side Quests + +As usual, I make sure that I complete every side quest I see during the journey: + +* {{pip 8568}}: Declare constants in `configuration.py` as such +* {{pip 8571}}: Clean up `Configuration.unset_value` + and nit the class' `__init__` +* {{pip 8578}}: Allow verbose/quiet level + to be specified via config file and env var +* {{pip 8599}}: Replace tabs by spaces for consistency + +## Snap Back to Reality + +A bit about me, I actually walked 500 meters earlier today to a bank +and walked 500 more to another to prepare my Visa card for purchasing +the upcoming Pinephone prototype. It's one of the first smartphones +to fully support a GNU/Linux distribution, where one can run desktop apps +(including proper terminals) as well as traditional services like SSH, +HTTP server and IPFS node because why not? Just a few hours ago, +I pre-ordered the [postmarketOS community edition][] with additional hardware +for convergence. 
+ +If you did not come here for a Pinephone ad, please take my apologies though d-; +and to ones reading this, I hope you all can become the person who walks +a thousand miles to fall down at the door opening to all +that you ever wished for! + +[Travis]: https://travis-ci.com +[my original proposal]: /assets/pip-parallel-dl.pdf +[postmarketOS community edition]: https://postmarketos.org/blog/2020/07/15/pinephone-ce-preorder/ diff --git a/blog/gsoc2020/blog20200803.md b/blog/gsoc2020/blog20200803.md new file mode 100644 index 0000000..de2ef8d --- /dev/null +++ b/blog/gsoc2020/blog20200803.md @@ -0,0 +1,46 @@ ++++ +rss = "GSoC 2020: Sorting Things Out" +date = Date(2020, 8, 3) ++++ +@def tags = ["pip", "gsoc"] + +# Sorting Things Out + +Hi! I really hope that everyone reading this is still doing okay, +and if that isn't the case, I wish you a good day! + +## `pip` 20.2 Released! + +Last Wednesday, `pip` 20.2 was released, delivering the `2020-resolver` +as well as many other improvements! I was lucky to be able +to get the `fast-deps` feature to be included as part of the release. +A brief description of this *experimental* feature as well as testing +instructions can be found on [Python Discuss][]. + +The public exposure of the feature also reminded me of some further +{{pip 8681 optimization}} to make on {{pip 8670 "the lazy wheel"}}. +Hopefully without download parallelization it would not be so slow +as to put off testing by concerned users of `pip`. + +## Preparation for Download Parallelization + +As of this moment, we already have: + +* {{pip 8162#issuecomment-667504162 "Multithreading pool fallback working"}} +* An opt-in to use lazy wheel to obtain dependency information, + and thus getting a list of wheels at the end of resolution + ready to be downloaded together + +What's left is *only* to interject a parallel download somewhere after +the dependency resolution step. Still, I have struggled with this way more than +I've ever imagined. 
I got so stuck that I had to give myself a day off +in the middle of the week (and study some Rust), then I came up with +{{pip 8638 "something that was agreed upon as difficult to maintain"}}. + +Indeed, a large part of this is my fault, for not communicating the design +thoroughly with `pip`'s maintainers and not carefully noting stuff down +during (verbal) discussions with my mentor. Thankfully {{pip 8685 +"Chris Hunt came to the rescue"}} and did a refactoring that will +make my future work much easier and cleaner. + +[Python Discuss]: https://discuss.python.org/t/announcement-pip-20-2-release/4863/2 diff --git a/blog/gsoc2020/blog20200817.md b/blog/gsoc2020/blog20200817.md new file mode 100644 index 0000000..40caad5 --- /dev/null +++ b/blog/gsoc2020/blog20200817.md @@ -0,0 +1,52 @@ ++++ +rss = "GSoC 2020: Parallelizing Wheel Downloads" +date = Date(2020, 8, 17) ++++ +@def tags = ["pip", "gsoc"] + +# Parallelizing Wheel Downloads + +> And now it's clear as this promise\ +> That we're making\ +> Two progress bars into one + +\toc + +Hello there! It has been raining a lot lately and some mosquito has given me +the Dengue fever today. To whoever is reading this, I hope it would never happen +to you. + +Download Parallelization +------------------------ + +I've been working on `pip`'s download parallelization for quite a while now. +As distribution download in `pip` was modeled as a lazily evaluated iterable +of chunks, parallelizing such procedure is as simple as submitting routines +that write files to disk to a worker pool. + +Or at least that is what I thought. + +Progress Reporting UI +--------------------- + +`pip` is currently using custom-defined progress reporting classes, +which were not designed to work with multithreaded code. Firstly, I want to +try using these instead of defining separate UI for multithreaded progresses. +As they use system signals for termination, the progress bars have to be +running in the main thread. Or sort of.
+ +Since the progress bars are designed as iterators, I realized that we +can call `next` on them. So quickly, I threw in some queues and locks, +and prototyped the first *working* {{pip 8771 "implementation of +progress synchronization"}}. + +Performance Issues +------------------ + +Welp, I only said that it works, but I didn't mention the performance, +which is terrible. I am pretty sure that the slowdown is with +the synchronization, since the `map_multithread` call doesn't seem +to trigger anything that may introduce any sort of blocking. + +This seems like a lot of fun, and I hope I'll get better tomorrow +to continue playing with it! diff --git a/blog/gsoc2020/blog20200831.md b/blog/gsoc2020/blog20200831.md new file mode 100644 index 0000000..eea0ead --- /dev/null +++ b/blog/gsoc2020/blog20200831.md @@ -0,0 +1,109 @@ ++++ +rss = "GSoC 2020: Outro" +date = Date(2020, 8, 31) ++++ +@def tags = ["pip", "gsoc"] + +# Outro + +> Steamed fish was amazing, matter of fact\ +> Let me get some jerk chicken to go\ +> Grabbed me one of them lemon pie theories\ +> And let me get some of them benchmarks you theories too + +\toc + +## The Look + +At the time of writing, +{{pip 8771 "implementation-wise parallel download is ready"}}: + +[![asciicast](/assets/pip-8771.svg)](https://asciinema.org/a/356704) + +Does this mean I've finished everything just-in-time? This sounds too good +to be true! And how does it perform? Welp... + +## The Benchmark + +Here comes the bad news: under a decent connection to the package index, +using `fast-deps` does not make `pip` faster.
For best comparison, +I will time `pip download` on the following cases: + +### Average Distribution + +For convenience purposes, let's refer to the commands to be used as follows + +```console +$ pip --no-cache-dir download {requirement} # legacy-resolver +$ pip --use-feature=2020-resolver \ + --no-cache-dir download {requirement} # 2020-resolver +$ pip --use-feature=2020-resolver --use-feature=fast-deps \ + --no-cache-dir download {requirement} # fast-deps +``` + +In the first test, I used [axuy][] and obtained the following results + +| legacy-resolver | 2020-resolver | fast-deps | +| --------------- | ------------- | --------- | +| 7.709s | 7.888s | 10.993s | +| 7.068s | 7.127s | 11.103s | +| 8.556s | 6.972s | 10.496s | + +Funny enough, running `pip download` with `fast-deps` in a directory +with downloaded files already took around 7-8 seconds. This is because +to lazily download a wheel, `pip` has to {{pip 8670 "make many requests"}} +which are apparently more expensive than actual data transmission on my network. 
+ +@@colbox-blue +With an unstable connection to PyPI (for some reason I am not confident enough +to state), this is what I got + +| 2020-resolver | fast-deps | +| ------------- | --------- | +| 1m16.134s | 0m54.894s | +| 1m0.384s | 0m40.753s | +| 0m50.102s | 0m41.988s | + +As the connection was *unstable* and the majority of `pip` networking +is performed in CI/CD with large and stable bandwidth, I am unsure what this +result is supposed to tell (-; +@@ + +### Large Distribution + +In this test, I used [TensorFlow][] as the requirement and obtained +the following figures: + +| legacy-resolver | 2020-resolver | fast-deps | +| --------------- | ------------- | --------- | +| 0m52.135s | 0m58.809s | 1m5.649s | +| 0m50.641s | 1m14.896s | 1m28.168s | +| 0m49.691s | 1m5.633s | 1m22.131s | + +### Distribution with Conflicting Dependencies + +Some requirement that will trigger a decent amount of backtracking by +the current implementation of the new resolver `oslo-utils==1.4.0`: + +| 2020-resolver | fast-deps | +| ------------- | --------- | +| 14.497s | 24.010s | +| 17.680s | 28.884s | +| 16.541s | 26.333s | + +## What Now? + +I don't know, to be honest. At this point I'm feeling I've failed my own +(and that of other stakeholders of `pip`) expectations and wasted the time +and effort of `pip`'s maintainers reviewing dozens of PRs I've made +in the last three months. + +On the bright side, this has been an opportunity for me to explore the codebase +of a package manager and discover various edge cases that the new resolver +has yet to cover (e.g. I've just noticed that `pip download` would save +to-be-discarded distributions, I'll file an issue on that soon). Plus I got +to know many new and cool people and ideas, which makes me a more helpful +individual to work on Python packaging in the future, I hope. + +[TensorFlow]: https://www.tensorflow.org +[axuy]: https://www.youtube.com/playlist?list=PLAA9fHINq3sayfxEyZSF2D_rMgDZGyL3N |