diff --git a/Pipfile b/Pipfile index e9ac7b6..be74d81 100644 --- a/Pipfile +++ b/Pipfile @@ -5,11 +5,11 @@ verify_ssl = true [dev-packages] pylint = "*" -yapf = "*" -jupyter = "*" +autopep8 = "*" [packages] requests = "*" +scrapy = "*" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 99567b1..7aef5d7 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f46c14070771fbc32dc703455ed627cf903f2287130e1f48e12b0f05e8a08147" + "sha256": "13a448d78382996dae90e9e369723f00b9e3713ee27e54ce87bc210556c3e577" }, "pipfile-spec": 6, "requires": { @@ -16,6 +16,27 @@ ] }, "default": { + "asn1crypto": { + "hashes": [ + "sha256:2f1adbb7546ed199e3c90ef23ec95c5cf3585bac7d11fb7eb562a3fe89c64e87", + "sha256:9d5c20441baf0cb60a4ac34cc447c6c189024b6b4c6cd7877034f4965c464e49" + ], + "version": "==0.24.0" + }, + "attrs": { + "hashes": [ + "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69", + "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb" + ], + "version": "==18.2.0" + }, + "automat": { + "hashes": [ + "sha256:cbd78b83fa2d81fe2a4d23d258e1661dd7493c9a50ee2f1a5b2cac61c1793b0e", + "sha256:fdccab66b68498af9ecfa1fa43693abe546014dd25cf28543cbe9d1334916a58" + ], + "version": "==0.7.0" + }, "certifi": { "hashes": [ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7", @@ -23,6 +44,43 @@ ], "version": "==2018.11.29" }, + "cffi": { + "hashes": [ + "sha256:151b7eefd035c56b2b2e1eb9963c90c6302dc15fbd8c1c0a83a163ff2c7d7743", + "sha256:1553d1e99f035ace1c0544050622b7bc963374a00c467edafac50ad7bd276aef", + "sha256:1b0493c091a1898f1136e3f4f991a784437fac3673780ff9de3bcf46c80b6b50", + "sha256:2ba8a45822b7aee805ab49abfe7eec16b90587f7f26df20c71dd89e45a97076f", + "sha256:3bb6bd7266598f318063e584378b8e27c67de998a43362e8fce664c54ee52d30", + "sha256:3c85641778460581c42924384f5e68076d724ceac0f267d66c757f7535069c93", + "sha256:3eb6434197633b7748cea30bf0ba9f66727cdce45117a712b29a443943733257", + "sha256:495c5c2d43bf6cebe0178eb3e88f9c4aa48d8934aa6e3cddb865c058da76756b", + "sha256:4c91af6e967c2015729d3e69c2e51d92f9898c330d6a851bf8f121236f3defd3", + "sha256:57b2533356cb2d8fac1555815929f7f5f14d68ac77b085d2326b571310f34f6e", + "sha256:770f3782b31f50b68627e22f91cb182c48c47c02eb405fd689472aa7b7aa16dc", + "sha256:79f9b6f7c46ae1f8ded75f68cf8ad50e5729ed4d590c74840471fc2823457d04", + "sha256:7a33145e04d44ce95bcd71e522b478d282ad0eafaf34fe1ec5bbd73e662f22b6", + "sha256:857959354ae3a6fa3da6651b966d13b0a8bed6bbc87a0de7b38a549db1d2a359", + "sha256:87f37fe5130574ff76c17cab61e7d2538a16f843bb7bca8ebbc4b12de3078596", + "sha256:95d5251e4b5ca00061f9d9f3d6fe537247e145a8524ae9fd30a2f8fbce993b5b", + "sha256:9d1d3e63a4afdc29bd76ce6aa9d58c771cd1599fbba8cf5057e7860b203710dd", + "sha256:a36c5c154f9d42ec176e6e620cb0dd275744aa1d804786a71ac37dc3661a5e95", + "sha256:a6a5cb8809091ec9ac03edde9304b3ad82ad4466333432b16d78ef40e0cce0d5", + "sha256:ae5e35a2c189d397b91034642cb0eab0e346f776ec2eb44a49a459e6615d6e2e", + "sha256:b0f7d4a3df8f06cf49f9f121bead236e328074de6449866515cea4907bbc63d6", + "sha256:b75110fb114fa366b29a027d0c9be3709579602ae111ff61674d28c93606acca", + "sha256:ba5e697569f84b13640c9e193170e89c13c6244c24400fc57e88724ef610cd31", + "sha256:be2a9b390f77fd7676d80bc3cdc4f8edb940d8c198ed2d8c0be1319018c778e1", + "sha256:ca1bd81f40adc59011f58159e4aa6445fc585a32bb8ac9badf7a2c1aa23822f2", + "sha256:d5d8555d9bfc3f02385c1c37e9f998e2011f0db4f90e250e5bc0c0a85a813085", + "sha256:e55e22ac0a30023426564b1059b035973ec82186ddddbac867078435801c7801", + "sha256:e90f17980e6ab0f3c2f3730e56d1fe9bcba1891eeea58966e89d352492cc74f4", + "sha256:ecbb7b01409e9b782df5ded849c178a0aa7c906cf8c5a67368047daab282b184", + "sha256:ed01918d545a38998bfa5902c7c00e0fee90e957ce036a4000a88e3fe2264917", + "sha256:edabd457cd23a02965166026fd9bfd196f4324fe6032e866d0f3bd0301cd486f", + "sha256:fdf1c1dc5bafc32bc5d08b054f94d659422b05aba244d6be4ddc1c72d9aa70fb" + ], + "version": "==1.11.5" + }, "chardet": { "hashes": [ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", @@ -30,167 +88,262 @@ ], "version": "==3.0.4" }, - "idna": { + "constantly": { "hashes": [ - "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", - "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" + "sha256:586372eb92059873e29eba4f9dec8381541b4d3834660707faf8ba59146dfc35", + "sha256:dd2fa9d6b1a51a83f0d7dd76293d734046aa176e384bf6e33b7e44880eb37c5d" ], - "version": "==2.8" + "version": "==15.1.0" }, - "requests": { + "cryptography": { "hashes": [ - "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", - "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" + "sha256:05a6052c6a9f17ff78ba78f8e6eb1d777d25db3b763343a1ae89a7a8670386dd", + "sha256:0eb83a24c650a36f68e31a6d0a70f7ad9c358fa2506dc7b683398b92e354a038", + "sha256:0ff4a3d6ea86aa0c9e06e92a9f986de7ee8231f36c4da1b31c61a7e692ef3378", + "sha256:1699f3e916981df32afdd014fb3164db28cdb61c757029f502cb0a8c29b2fdb3", + "sha256:1b1f136d74f411f587b07c076149c4436a169dc19532e587460d9ced24adcc13", + "sha256:21e63dd20f5e5455e8b34179ac43d95b3fb1ffa54d071fd2ed5d67da82cfe6dc", + "sha256:2454ada8209bbde97065453a6ca488884bbb263e623d35ba183821317a58b46f", + "sha256:3cdc5f7ca057b2214ce4569e01b0f368b3de9d8ee01887557755ccd1c15d9427", + "sha256:418e7a5ec02a7056d3a4f0c0e7ea81df374205f25f4720bb0e84189aa5fd2515", + "sha256:471a097076a7c4ab85561d7fa9a1239bd2ae1f9fd0047520f13d8b340bf3210b", + "sha256:5ecaf9e7db3ca582c6de6229525d35db8a4e59dc3e8a40a331674ed90e658cbf", + "sha256:63b064a074f8dc61be81449796e2c3f4e308b6eba04a241a5c9f2d05e882c681", + "sha256:6afe324dfe6074822ccd56d80420df750e19ac30a4e56c925746c735cf22ae8b", + "sha256:70596e90398574b77929cd87e1ac6e43edd0e29ba01e1365fed9c26bde295aa5", + "sha256:70c2b04e905d3f72e2ba12c58a590817128dfca08949173faa19a42c824efa0b", + "sha256:8908f1db90be48b060888e9c96a0dee9d842765ce9594ff6a23da61086116bb6", + "sha256:af12dfc9874ac27ebe57fc28c8df0e8afa11f2a1025566476b0d50cdb8884f70", + "sha256:b4fc04326b2d259ddd59ed8ea20405d2e695486ab4c5e1e49b025c484845206e", + "sha256:da5b5dda4aa0d5e2b758cc8dfc67f8d4212e88ea9caad5f61ba132f948bab859" ], - "index": "aliyun", - "version": "==2.21.0" + "version": "==2.4.2" }, - "urllib3": { + "cssselect": { "hashes": [ - "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", - "sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22" + "sha256:066d8bc5229af09617e24b3ca4d52f1f9092d9e061931f4184cd572885c23204", + "sha256:3b5103e8789da9e936a68d993b70df732d06b8bb9a337a05ed4eb52c17ef7206" ], - "version": "==1.24.1" - } - }, - "develop": { - "appnope": { + "version": "==1.0.3" + }, + "hyperlink": { "hashes": [ - "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", - "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" + "sha256:98da4218a56b448c7ec7d2655cb339af1f7d751cf541469bb4fc28c4a4245b34", + "sha256:f01b4ff744f14bc5d0a22a6b9f1525ab7d6312cb0ff967f59414bbac52f0a306" ], - "markers": "sys_platform == 'darwin'", - "version": "==0.1.0" + "version": "==18.0.0" }, - "astroid": { + "idna": { "hashes": [ - "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22", - "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e" + "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", + "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" ], - "version": "==2.1.0" + "version": "==2.8" }, - "backcall": { + "incremental": { "hashes": [ - "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", - "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" + "sha256:717e12246dddf231a349175f48d74d93e2897244939173b01974ab6661406b9f", + "sha256:7b751696aaf36eebfab537e458929e194460051ccad279c72b755a167eebd4b3" ], - "version": "==0.1.0" + "version": "==17.5.0" }, - "bleach": { + "lxml": { "hashes": [ - "sha256:48d39675b80a75f6d1c3bdbffec791cf0bbbab665cf01e20da701c77de278718", - "sha256:73d26f018af5d5adcdabf5c1c974add4361a9c76af215fe32fdec8a6fc5fb9b9" + "sha256:02bc220d61f46e9b9d5a53c361ef95e9f5e1d27171cd461dddb17677ae2289a5", + "sha256:22f253b542a342755f6cfc047fe4d3a296515cf9b542bc6e261af45a80b8caf6", + "sha256:2f31145c7ff665b330919bfa44aacd3a0211a76ca7e7b441039d2a0b0451e415", + "sha256:36720698c29e7a9626a0dc802ef8885f8f0239bfd1689628ecd459a061f2807f", + "sha256:438a1b0203545521f6616132bfe0f4bca86f8a401364008b30e2b26ec408ce85", + "sha256:4815892904c336bbaf73dafd54f45f69f4021c22b5bad7332176bbf4fb830568", + "sha256:5be031b0f15ad63910d8e5038b489d95a79929513b3634ad4babf77100602588", + "sha256:5c93ae37c3c588e829b037fdfbd64a6e40c901d3f93f7beed6d724c44829a3ad", + "sha256:60842230678674cdac4a1cf0f707ef12d75b9a4fc4a565add4f710b5fcf185d5", + "sha256:62939a8bb6758d1bf923aa1c13f0bcfa9bf5b2fc0f5fa917a6e25db5fe0cfa4e", + "sha256:75830c06a62fe7b8fe3bbb5f269f0b308f19f3949ac81cfd40062f47c1455faf", + "sha256:81992565b74332c7c1aff6a913a3e906771aa81c9d0c68c68113cffcae45bc53", + "sha256:8c892fb0ee52c594d9a7751c7d7356056a9682674b92cc1c4dc968ff0f30c52f", + "sha256:9d862e3cf4fc1f2837dedce9c42269c8c76d027e49820a548ac89fdcee1e361f", + "sha256:a623965c086a6e91bb703d4da62dabe59fe88888e82c4117d544e11fd74835d6", + "sha256:a7783ab7f6a508b0510490cef9f857b763d796ba7476d9703f89722928d1e113", + "sha256:aab09fbe8abfa3b9ce62aaf45aca2d28726b1b9ee44871dbe644050a2fff4940", + "sha256:abf181934ac3ef193832fb973fd7f6149b5c531903c2ec0f1220941d73eee601", + "sha256:ae07fa0c115733fce1e9da96a3ac3fa24801742ca17e917e0c79d63a01eeb843", + "sha256:b9c78242219f674ab645ec571c9a95d70f381319a23911941cd2358a8e0521cf", + "sha256:bccb267678b870d9782c3b44d0cefe3ba0e329f9af8c946d32bf3778e7a4f271", + "sha256:c4df4d27f4c93b2cef74579f00b1d3a31a929c7d8023f870c4b476f03a274db4", + "sha256:caf0e50b546bb60dfa99bb18dfa6748458a83131ecdceaf5c071d74907e7e78a", + "sha256:d3266bd3ac59ac4edcd5fa75165dee80b94a3e5c91049df5f7c057ccf097551c", + "sha256:db0d213987bcd4e6d41710fb4532b22315b0d8fb439ff901782234456556aed1", + "sha256:dbbd5cf7690a40a9f0a9325ab480d0fccf46d16b378eefc08e195d84299bfae1", + "sha256:e16e07a0ec3a75b5ee61f2b1003c35696738f937dc8148fbda9fe2147ccb6e61", + "sha256:e175a006725c7faadbe69e791877d09936c0ef2cf49d01b60a6c1efcb0e8be6f", + "sha256:edd9c13a97f6550f9da2236126bb51c092b3b1ce6187f2bd966533ad794bbb5e", + "sha256:fa39ea60d527fbdd94215b5e5552f1c6a912624521093f1384a491a8ad89ad8b" ], - "version": "==3.0.2" + "version": "==4.2.5" }, - "decorator": { + "parsel": { "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + "sha256:493a9214acbdcb4487a084d95344c25e85e90426a67311ea0425dc5df8dc24b9", + "sha256:9ccd82b8a122345601f6f9209e972c0e8c3518a188fcff2d37cb4d7bc570b4b8" ], - "version": "==4.3.0" + "version": "==1.5.1" }, - "defusedxml": { + "pyasn1": { "hashes": [ - "sha256:24d7f2f94f7f3cb6061acb215685e5125fbcdc40a857eff9de22518820b0a4f4", - "sha256:702a91ade2968a82beb0db1e0766a6a273f33d4616a6ce8cde475d8e09853b20" + "sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7", + "sha256:da6b43a8c9ae93bc80e2739efb38cc776ba74a886e3e9318d65fe81a8b8a2c6e" ], - "version": "==0.5.0" + "version": "==0.4.5" }, - "entrypoints": { + "pyasn1-modules": { "hashes": [ - "sha256:10ad569bb245e7e2ba425285b9fa3e8178a0dc92fc53b1e1c553805e15a8825b", - "sha256:d2d587dde06f99545fb13a383d2cd336a8ff1f359c5839ce3a64c917d10c029f" + "sha256:642afdabb681d39f5948fd5477764d94faf17ce40e5691e9998b52815fbb4e71", + "sha256:d14fcb29dabecba3d7b360bf72327c26c385248a5d603cf6be5f566ce999b261" ], "version": "==0.2.3" }, - "ipykernel": { + "pycparser": { "hashes": [ - "sha256:0aeb7ec277ac42cc2b59ae3d08b10909b2ec161dc6908096210527162b53675d", - "sha256:0fc0bf97920d454102168ec2008620066878848fcfca06c22b669696212e292f" + "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" ], - "version": "==5.1.0" + "version": "==2.19" }, - "ipython": { + "pydispatcher": { "hashes": [ - "sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12", - "sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742" + "sha256:5570069e1b1769af1fe481de6dd1d3a388492acddd2cdad7a3bde145615d5caf", + "sha256:5be4a8be12805ef7d712dd9a93284fb8bc53f309867e573f653a72e5fd10e433" ], - "markers": "python_version >= '3.3'", - "version": "==7.2.0" + "version": "==2.0.5" }, - "ipython-genutils": { + "pyhamcrest": { "hashes": [ - "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", - "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" + "sha256:6b672c02fdf7470df9674ab82263841ce8333fb143f32f021f6cb26f0e512420", + "sha256:8ffaa0a53da57e89de14ced7185ac746227a8894dbd5a3c718bf05ddbd1d56cd" ], - "version": "==0.2.0" + "version": "==1.9.0" }, - "ipywidgets": { + "pyopenssl": { "hashes": [ - "sha256:0f2b5cde9f272cb49d52f3f0889fdd1a7ae1e74f37b48dac35a83152780d2b7b", - "sha256:a3e224f430163f767047ab9a042fc55adbcab0c24bbe6cf9f306c4f89fdf0ba3" + "sha256:26ff56a6b5ecaf3a2a59f132681e2a80afcc76b4f902f612f518f92c2a1bf854", + "sha256:6488f1423b00f73b7ad5167885312bb0ce410d3312eb212393795b53c8caa580" ], - "version": "==7.4.2" + "version": "==18.0.0" }, - "isort": { + "queuelib": { "hashes": [ - "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", - "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", - "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" + "sha256:42b413295551bdc24ed9376c1a2cd7d0b1b0fa4746b77b27ca2b797a276a1a17", + "sha256:ff43b5b74b9266f8df4232a8f768dc4d67281a271905e2ed4a3689d4d304cd02" ], - "version": "==4.3.4" + "version": "==1.5.0" }, - "jedi": { + "requests": { "hashes": [ - "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd", - "sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191" + "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e", + "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b" ], - "version": "==0.13.2" + "index": "aliyun", + "version": "==2.21.0" }, - "jinja2": { + "scrapy": { "hashes": [ - "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd", - "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4" + "sha256:5a398bf6818f87dcc817c919408a195f19ba46414ae12f259119336cfa862bb6", + "sha256:5b9621731e26b0d195ca3e25ab34d559f45b0b906c0a0cc359199f1b6b612184" ], - "version": "==2.10" + "index": "aliyun", + "version": "==1.5.1" }, - "jsonschema": { + "service-identity": { "hashes": [ - "sha256:000e68abd33c972a5248544925a0cae7d1125f9bf6c58280d37546b946769a08", - "sha256:6ff5f3180870836cae40f06fa10419f557208175f13ad7bc26caa77beb1f6e02" + "sha256:001c0707759cb3de7e49c078a7c0c9cd12594161d3bf06b9c254fdcb1a60dc36", + "sha256:0858a54aabc5b459d1aafa8a518ed2081a285087f349fe3e55197989232e2e2d" ], - "version": "==2.6.0" + "version": "==18.1.0" }, - "jupyter": { + "six": { "hashes": [ - "sha256:3e1f86076bbb7c8c207829390305a2b1fe836d471ed54be66a3b8c41e7f46cc7", - "sha256:5b290f93b98ffbc21c0c7e749f054b3267782166d72fa5e3ed1ed4eaf34a2b78", - "sha256:d9dc4b3318f310e34c82951ea5d6683f67bed7def4b259fafbfe4f1beb1d8e5f" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "index": "aliyun", - "version": "==1.0.0" + "version": "==1.12.0" }, - "jupyter-client": { + "twisted": { "hashes": [ - "sha256:b5f9cb06105c1d2d30719db5ffb3ea67da60919fb68deaefa583deccd8813551", - "sha256:c44411eb1463ed77548bc2d5ec0d744c9b81c4a542d9637c7a52824e2121b987" + "sha256:294be2c6bf84ae776df2fc98e7af7d6537e1c5e60a46d33c3ce2a197677da395" ], - "version": "==5.2.4" + "version": "==18.9.0" }, - "jupyter-console": { + "urllib3": { "hashes": [ - "sha256:308ce876354924fb6c540b41d5d6d08acfc946984bf0c97777c1ddcb42e0b2f5", - "sha256:cc80a97a5c389cbd30252ffb5ce7cefd4b66bde98219edd16bf5cb6f84bb3568" + "sha256:61bf29cada3fc2fbefad4fdf059ea4bd1b4a86d2b6d15e1c7c0b582b9752fe39", + "sha256:de9529817c93f27c8ccbfead6985011db27bd0ddfcdb2d86f3f663385c6a9c22" ], - "version": "==6.0.0" + "version": "==1.24.1" }, - "jupyter-core": { + "w3lib": { + "hashes": [ + "sha256:55994787e93b411c2d659068b51b9998d9d0c05e0df188e6daf8f45836e1ea38", + "sha256:aaf7362464532b1036ab0092e2eee78e8fd7b56787baa9ed4967457b083d011b" + ], + "version": "==1.19.0" + }, + "zope.interface": { + "hashes": [ + "sha256:086707e0f413ff8800d9c4bc26e174f7ee4c9c8b0302fbad68d083071822316c", + "sha256:1157b1ec2a1f5bf45668421e3955c60c610e31913cc695b407a574efdbae1f7b", + "sha256:11ebddf765bff3bbe8dbce10c86884d87f90ed66ee410a7e6c392086e2c63d02", + "sha256:14b242d53f6f35c2d07aa2c0e13ccb710392bcd203e1b82a1828d216f6f6b11f", + "sha256:1b3d0dcabc7c90b470e59e38a9acaa361be43b3a6ea644c0063951964717f0e5", + "sha256:20a12ab46a7e72b89ce0671e7d7a6c3c1ca2c2766ac98112f78c5bddaa6e4375", + "sha256:298f82c0ab1b182bd1f34f347ea97dde0fffb9ecf850ecf7f8904b8442a07487", + "sha256:2f6175722da6f23dbfc76c26c241b67b020e1e83ec7fe93c9e5d3dd18667ada2", + "sha256:3b877de633a0f6d81b600624ff9137312d8b1d0f517064dfc39999352ab659f0", + "sha256:4265681e77f5ac5bac0905812b828c9fe1ce80c6f3e3f8574acfb5643aeabc5b", + "sha256:550695c4e7313555549aa1cdb978dc9413d61307531f123558e438871a883d63", + "sha256:5f4d42baed3a14c290a078e2696c5f565501abde1b2f3f1a1c0a94fbf6fbcc39", + "sha256:62dd71dbed8cc6a18379700701d959307823b3b2451bdc018594c48956ace745", + "sha256:7040547e5b882349c0a2cc9b50674b1745db551f330746af434aad4f09fba2cc", + "sha256:7e099fde2cce8b29434684f82977db4e24f0efa8b0508179fce1602d103296a2", + "sha256:7e5c9a5012b2b33e87980cee7d1c82412b2ebabcb5862d53413ba1a2cfde23aa", + "sha256:81295629128f929e73be4ccfdd943a0906e5fe3cdb0d43ff1e5144d16fbb52b1", + "sha256:95cc574b0b83b85be9917d37cd2fad0ce5a0d21b024e1a5804d044aabea636fc", + "sha256:968d5c5702da15c5bf8e4a6e4b67a4d92164e334e9c0b6acf080106678230b98", + "sha256:9e998ba87df77a85c7bed53240a7257afe51a07ee6bc3445a0bf841886da0b97", + "sha256:a0c39e2535a7e9c195af956610dba5a1073071d2d85e9d2e5d789463f63e52ab", + "sha256:a15e75d284178afe529a536b0e8b28b7e107ef39626a7809b4ee64ff3abc9127", + "sha256:a6a6ff82f5f9b9702478035d8f6fb6903885653bff7ec3a1e011edc9b1a7168d", + "sha256:b639f72b95389620c1f881d94739c614d385406ab1d6926a9ffe1c8abbea23fe", + "sha256:bad44274b151d46619a7567010f7cde23a908c6faa84b97598fd2f474a0c6891", + "sha256:bbcef00d09a30948756c5968863316c949d9cedbc7aabac5e8f0ffbdb632e5f1", + "sha256:d788a3999014ddf416f2dc454efa4a5dbeda657c6aba031cf363741273804c6b", + "sha256:eed88ae03e1ef3a75a0e96a55a99d7937ed03e53d0cffc2451c208db445a2966", + "sha256:f99451f3a579e73b5dd58b1b08d1179791d49084371d9a47baad3b22417f0317" + ], + "version": "==4.6.0" + } + }, + "develop": { + "astroid": { + "hashes": [ + "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22", + "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e" + ], + "version": "==2.1.0" + }, + "autopep8": { "hashes": [ - "sha256:927d713ffa616ea11972534411544589976b2493fc7e09ad946e010aa7eb9970", - "sha256:ba70754aa680300306c699790128f6fbd8c306ee5927976cbe48adacf240c0b7" + "sha256:33d2b5325b7e1afb4240814fe982eea3a92ebea712869bfd08b3c0393404248c" + ], + "index": "aliyun", + "version": "==1.4.3" + }, + "isort": { + "hashes": [ + "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af", + "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8", + "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497" ], - "version": "==4.4.0" + "version": "==4.3.4" }, "lazy-object-proxy": { "hashes": [ @@ -226,39 +379,6 @@ ], "version": "==1.3.1" }, - "markupsafe": { - "hashes": [ - "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432", - "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b", - "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9", - "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af", - "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834", - "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd", - "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d", - "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7", - "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b", - "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3", - "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c", - "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2", - "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7", - "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36", - "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1", - "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e", - "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1", - "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c", - "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856", - "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550", - "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492", - "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672", - "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401", - "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6", - "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6", - "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c", - "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd", - "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1" - ], - "version": "==1.1.0" - }, "mccabe": { "hashes": [ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", @@ -266,90 +386,12 @@ ], "version": "==0.6.1" }, - "mistune": { - "hashes": [ - "sha256:59a3429db53c50b5c6bcc8a07f8848cb00d7dc8bdb431a4ab41920d201d4756e", - "sha256:88a1051873018da288eee8538d476dffe1262495144b33ecb586c4ab266bb8d4" - ], - "version": "==0.8.4" - }, - "nbconvert": { - "hashes": [ - "sha256:08d21cf4203fabafd0d09bbd63f06131b411db8ebeede34b0fd4be4548351779", - "sha256:a8a2749f972592aa9250db975304af6b7337f32337e523a2c995cc9e12c07807" - ], - "version": "==5.4.0" - }, - "nbformat": { - "hashes": [ - "sha256:b9a0dbdbd45bb034f4f8893cafd6f652ea08c8c1674ba83f2dc55d3955743b0b", - "sha256:f7494ef0df60766b7cabe0a3651556345a963b74dbc16bc7c18479041170d402" - ], - "version": "==4.4.0" - }, - "notebook": { - "hashes": [ - "sha256:3ab2db8bc10e6edbd264c3c4b800bee276c99818386ee0c146d98d7e6bcf0a67", - "sha256:d908673a4010787625c8952e91a22adf737db031f2aa0793ad92f6558918a74a" - ], - "version": "==5.7.4" - }, - "pandocfilters": { - "hashes": [ - "sha256:b3dd70e169bb5449e6bc6ff96aea89c5eea8c5f6ab5e207fc2f521a2cf4a0da9" - ], - "version": "==1.4.2" - }, - "parso": { - "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" - ], - "version": "==0.3.1" - }, - "pexpect": { - "hashes": [ - "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba", - "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b" - ], - "markers": "sys_platform != 'win32'", - "version": "==4.6.0" - }, - "pickleshare": { + "pycodestyle": { "hashes": [ - "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", - "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" + "sha256:cbc619d09254895b0d12c2c691e237b2e91e9b2ecf5e84c26b35400f93dcfb83", + "sha256:cbfca99bd594a10f674d0cd97a3d802a1fdef635d4361e1a2658de47ed261e3a" ], - "version": "==0.7.5" - }, - "prometheus-client": { - "hashes": [ - "sha256:e8c11ff5ca53de6c3d91e1510500611cafd1d247a937ec6c588a0a7cc3bef93c" - ], - "version": "==0.5.0" - }, - "prompt-toolkit": { - "hashes": [ - "sha256:c1d6aff5252ab2ef391c2fe498ed8c088066f66bc64a8d5c095bbf795d9fec34", - "sha256:d4c47f79b635a0e70b84fdb97ebd9a274203706b1ee5ed44c10da62755cf3ec9", - "sha256:fd17048d8335c1e6d5ee403c3569953ba3eb8555d710bfc548faf0712666ea39" - ], - "version": "==2.0.7" - }, - "ptyprocess": { - "hashes": [ - "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", - "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" - ], - "markers": "os_name != 'nt'", - "version": "==0.6.0" - }, - "pygments": { - "hashes": [ - "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", - "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" - ], - "version": "==2.3.1" + "version": "==2.4.0" }, "pylint": { "hashes": [ @@ -359,57 +401,6 @@ "index": "aliyun", "version": "==2.2.2" }, - "python-dateutil": { - "hashes": [ - "sha256:063df5763652e21de43de7d9e00ccf239f953a832941e37be541614732cdfc93", - "sha256:88f9287c0174266bb0d8cedd395cfba9c58e87e5ad86b2ce58859bc11be3cf02" - ], - "version": "==2.7.5" - }, - "pyzmq": { - "hashes": [ - "sha256:25a0715c8f69cf72f67cfe5a68a3f3ed391c67c063d2257bec0fe7fc2c7f08f8", - "sha256:2bab63759632c6b9e0d5bf19cc63c3b01df267d660e0abcf230cf0afaa966349", - "sha256:30ab49d99b24bf0908ebe1cdfa421720bfab6f93174e4883075b7ff38cc555ba", - "sha256:32c7ca9fc547a91e3c26fc6080b6982e46e79819e706eb414dd78f635a65d946", - "sha256:41219ae72b3cc86d97557fe5b1ef5d1adc1057292ec597b50050874a970a39cf", - "sha256:4b8c48a9a13cea8f1f16622f9bd46127108af14cd26150461e3eab71e0de3e46", - "sha256:55724997b4a929c0d01b43c95051318e26ddbae23565018e138ae2dc60187e59", - "sha256:65f0a4afae59d4fc0aad54a917ab599162613a761b760ba167d66cc646ac3786", - "sha256:6f88591a8b246f5c285ee6ce5c1bf4f6bd8464b7f090b1333a446b6240a68d40", - "sha256:75022a4c60dcd8765bb9ca32f6de75a0ec83b0d96e0309dc479f4c7b21f26cb7", - "sha256:76ea493bfab18dcb090d825f3662b5612e2def73dffc196d51a5194b0294a81d", - "sha256:7b60c045b80709e4e3c085bab9b691e71761b44c2b42dbb047b8b498e7bc16b3", - "sha256:8e6af2f736734aef8ed6f278f9f552ec7f37b1a6b98e59b887484a840757f67d", - "sha256:92c5d4be2ab6cd0112807107cc078ed7c5b55d1a5185a43b28e6193b8378eb9b", - "sha256:9ac2298e486524331e26390eac14e4627effd3f8e001d4266ed9d8f1d2d31cce", - "sha256:9ba650f493a9bc1f24feca1d90fce0e5dd41088a252ac9840131dfbdbf3815ca", - "sha256:a3ceee84114d9f5711fa0f4db9c652af0e4636c89eabc9b7f03a3882569dd1ed", - "sha256:a72b82ac1910f2cf61a49139f4974f994984475f771b0faa730839607eeedddf", - "sha256:ab136ac51027e7c484c53138a0fab4a8a51e80d05162eb7b1585583bcfdbad27", - "sha256:c095b224300bcac61e6c445e27f9046981b1ac20d891b2f1714da89d34c637c8", - "sha256:c5cc52d16c06dc2521340d69adda78a8e1031705924e103c0eb8fc8af861d810", - "sha256:d612e9833a89e8177f8c1dc68d7b4ff98d3186cd331acd616b01bbdab67d3a7b", - "sha256:e828376a23c66c6fe90dcea24b4b72cd774f555a6ee94081670872918df87a19", - "sha256:e9767c7ab2eb552796440168d5c6e23a99ecaade08dda16266d43ad461730192", - "sha256:ebf8b800d42d217e4710d1582b0c8bff20cdcb4faad7c7213e52644034300924" - ], - "version": "==17.1.2" - }, - "qtconsole": { - "hashes": [ - "sha256:1ac4a65e81a27b0838330a6d351c2f8435d4013d98a95373e8a41119b2968390", - "sha256:bc1ba15f50c29ed50f1268ad823bb6543be263c18dd093b80495e9df63b003ac" - ], - "version": "==4.4.3" - }, - "send2trash": { - "hashes": [ - "sha256:60001cc07d707fe247c94f74ca6ac0d3255aabcb930529690897ca2a39db28b2", - "sha256:f1691922577b6fa12821234aeb57599d887c4900b9ca537948d2dac34aea888b" - ], - "version": "==1.5.0" - }, "six": { "hashes": [ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", @@ -417,73 +408,11 @@ ], "version": "==1.12.0" }, - "terminado": { - "hashes": [ - "sha256:55abf9ade563b8f9be1f34e4233c7b7bde726059947a593322e8a553cc4c067a", - "sha256:65011551baff97f5414c67018e908110693143cfbaeb16831b743fe7cad8b927" - ], - "version": "==0.8.1" - }, - "testpath": { - "hashes": [ - "sha256:46c89ebb683f473ffe2aab0ed9f12581d4d078308a3cb3765d79c6b2317b0109", - "sha256:b694b3d9288dbd81685c5d2e7140b81365d46c29f5db4bc659de5aa6b98780f8" - ], - "version": "==0.4.2" - }, - "tornado": { - "hashes": [ - "sha256:0662d28b1ca9f67108c7e3b77afabfb9c7e87bde174fbda78186ecedc2499a9d", - "sha256:4e5158d97583502a7e2739951553cbd88a72076f152b4b11b64b9a10c4c49409", - "sha256:732e836008c708de2e89a31cb2fa6c0e5a70cb60492bee6f1ea1047500feaf7f", - "sha256:8154ec22c450df4e06b35f131adc4f2f3a12ec85981a203301d310abf580500f", - "sha256:8e9d728c4579682e837c92fdd98036bd5cdefa1da2aaf6acf26947e6dd0c01c5", - "sha256:d4b3e5329f572f055b587efc57d29bd051589fb5a43ec8898c77a47ec2fa2bbb", - "sha256:e5f2585afccbff22390cddac29849df463b252b711aa2ce7c5f3f342a5b3b444" - ], - "version": "==5.1.1" - }, - "traitlets": { - "hashes": [ - "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", - "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" - ], - "version": "==4.3.2" - }, - "wcwidth": { - "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" - ], - "version": "==0.1.7" - }, - "webencodings": { - "hashes": [ - "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", - "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" - ], - "version": "==0.5.1" - }, - "widgetsnbextension": { - "hashes": [ - "sha256:14b2c65f9940c9a7d3b70adbe713dbd38b5ec69724eebaba034d1036cf3d4740", - "sha256:fa618be8435447a017fd1bf2c7ae922d0428056cfc7449f7a8641edf76b48265" - ], - "version": "==3.4.2" - }, "wrapt": { "hashes": [ "sha256:d4d560d479f2c21e1b5443bbd15fe7ec4b37fe7e53d335d3b9b0a7b1226fe3c6" ], "version": "==1.10.11" - }, - "yapf": { - "hashes": [ - "sha256:8aa7f9abdb97b4da4d3227306b88477982daafef0a96cc41639754ca31f46d55", - "sha256:f2df5891481f94ddadfbf8ae8ae499080752cfb06005a31bbb102f3012f8b944" - ], - "index": "aliyun", - "version": "==0.25.0" } } } diff --git a/get_json.py b/get_json_requests.py similarity index 53% rename from get_json.py rename to get_json_requests.py index f54d17f..452221c 100644 --- a/get_json.py +++ b/get_json_requests.py @@ -1,5 +1,6 @@ import json import os +import time import requests @@ -22,13 +23,13 @@ def get_html(self): try: response = requests.get(url, headers=headers) if response.status_code == 200: - return response.text + return response except requests.ConnectionError as e: print(e) - return None + pass - def test(self, text): - result = json.loads(text) + def test(self, response): + result = json.loads(response.text) data = result.get('data') if data: object_list = data.get('object_list') @@ -37,26 +38,38 @@ def test(self, text): else: return True - def write_into_file(self, text): - result = json.dumps( - json.loads(text), indent=4, ensure_ascii=False) - if not os.path.exists(DIST_DIR): - os.makedirs(DIST_DIR) + def write_into_file(self, response): + result = json.dumps(json.loads(response.text), indent=4, ensure_ascii=False) + if not os.path.exists( + os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)): + os.makedirs(os.path.join(os.path.join(DIST_DIR, 'json'), self.kw)) with open( - 'dist/result{0}.json'.format(int(self.start / 24) + 1), + 'dist/json/{0}/{1}.json'.format(self.kw, + int(self.start / 24) + 1), 'w', encoding='utf-8') as f: f.write(result) def main(): - kw = 'correct' - for i in range(0, 360, 24): + print('Enter the keyowrd: ', end='') + kw = input() + # kw = 'taeyeon' + start = time.time() + counter = 0 + for i in range(0, 3600, 24): spider = Spider(kw, start=i) - text = spider.get_html() - items = spider.test(text) - if items: - spider.write_into_file(text) + response = spider.get_html() + contents = spider.test(response) + if contents: + print( + 'Downloading: {0}.json It costs {1}s'.format( + str(i // 24 + 1), str(time.time() - start)),) + spider.write_into_file(response) + counter += 1 + else: + break + print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start))) if __name__ == '__main__': diff --git a/main.py b/main.py deleted file mode 100644 index 223dcc4..0000000 --- a/main.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -import os -from hashlib import md5 - -import requests - -BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -DIST_DIR = os.path.join(BASE_DIR, 'dist') - - -class Spider: - def __init__(self, kw, start=0): - self.kw = kw - self.start = start - - def get_html(self): - url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={0}&type=feed&include_fields=top_comments%2Cis_root%2Csource_link%2Citem%2Cbuyable%2Croot_id%2Cstatus%2Clike_count%2Clike_id%2Csender%2Calbum%2Creply_count%2Cfavorite_blog_id&_type=&start={1}'.format( - self.kw, self.start) - headers = { - 'User-Agent': - 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36' - } - try: - response = requests.get(url, headers=headers) - if response.status_code == 200: - return response.text - except requests.ConnectionError as e: - print(e) - return None - - def test(self, text): - result = json.loads(text) - data = result.get('data') - if data: - object_list = data.get('object_list') - if not object_list: - return [] - else: - for item in object_list: - contents = {} - contents['path'] = item.get('photo').get('path') - yield contents - - def write_into_file(self, item): - if not os.path.exists(os.path.join(DIST_DIR, self.kw)): - os.makedirs(os.path.join(DIST_DIR, self.kw)) - try: - image_url = item.get('path') - if 'gif' in image_url: - response = requests.get(image_url) - if response.status_code == 200: - file_path = '{0}/{1}/{2}.{3}'.format( - DIST_DIR, self.kw, - md5(response.content).hexdigest(), 'gif') - if not os.path.exists(file_path): - with open(file_path, 'wb') as f: - f.write(response.content) - else: - print( - 'Already Downloaded', - md5(response.content).hexdigest(), - 'gif', - sep='') - else: - response = requests.get(image_url) - if response.status_code == 200: - file_path = '{0}/{1}/{2}.{3}'.format( - DIST_DIR, self.kw, - md5(response.content).hexdigest(), 'jpg') - if not os.path.exists(file_path): - with open(file_path, 'wb') as f: - f.write(response.content) - else: - print( - 'Already Downloaded', - md5(response.content).hexdigest(), - 'jpg', - sep='') - except requests.ConnectionError: - print('Failed to save image') - - -def main(): - kw = 'taeyeon' - for i in range(0, 3600, 24): - spider = Spider(kw, start=i) - text = spider.get_html() - items = spider.test(text) - if items: - for item in items: - if 'gif' in item.get('path'): - print('Downloading: ' + item['path'][:-5]) - else: - print('Downloading: ' + item['path']) - spider.write_into_file(item) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/spider_requests.py b/spider_requests.py new file mode 100644 index 0000000..1201aff --- /dev/null +++ b/spider_requests.py @@ -0,0 +1,115 @@ +import json +import os +import time +from hashlib import md5 + +import requests +import get_json_requests + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +DIST_DIR = os.path.join(BASE_DIR, 'dist') + + +class Spider(get_json_requests.Spider): + def __init__(self, kw, start=0): + self.kw = kw + self.start = start + + def test(self, response): + result = json.loads(response.text) + data = result.get('data') + if data: + object_list = data.get('object_list') + if not object_list: + return [] + else: + for item in object_list: + contents = {} + photo = item.get('photo') + if photo: + path = photo.get('path') + if path: + contents['path'] = path + yield contents + + def get_html_2(self, content): + try: + url = content.get('path') + if 'gif_jpeg' in url: + response = requests.get(url[:-5]) + if response.status_code == 200: + return ('gif', response) + elif 'png' in url: + response = requests.get(url) + if response.status_code == 200: + return ('png', response) + elif 'jpg' or 'jpeg' in url: + response = requests.get(url) + if response.status_code == 200: + return ('jpg', response) + else: + print('Unknown format.') + pass + except requests.ConnectionError as e: + print(e) + pass + + def write_into_file(self, format, response): + if not os.path.exists(os.path.join(DIST_DIR, self.kw)): + os.makedirs(os.path.join(DIST_DIR, self.kw)) + if format == 'gif': + file_path = '{0}/{1}/{2}.{3}'.format( + DIST_DIR, self.kw, + md5(response.content).hexdigest(), 'gif') + if not os.path.exists(file_path): + with open(file_path, 'wb') as f: + f.write(response.content) + else: + print('Already Downloaded {0}.gif'.format( + md5(response.content).hexdigest())) + elif format == 'png': + file_path = '{0}/{1}/{2}.{3}'.format( + DIST_DIR, self.kw, + md5(response.content).hexdigest(), 'png') + if not os.path.exists(file_path): + with open(file_path, 'wb') as f: + f.write(response.content) + else: + print('Already Downloaded {0}.png'.format( + md5(response.content).hexdigest())) + elif format == 'jpg': + file_path = '{0}/{1}/{2}.{3}'.format( + DIST_DIR, self.kw, + md5(response.content).hexdigest(), 'jpg') + if not os.path.exists(file_path): + with open(file_path, 'wb') as f: + f.write(response.content) + else: + print('Already Downloaded {0}.jpg'.format( + md5(response.content).hexdigest())) + + +def main(): + print('Enter the keyowrd: ', end='') + kw = input() + # kw = 'taeyeon' + start = time.time() + counter = 0 + for i in range(0, 960, 24): + spider = Spider(kw, start=i) + response = spider.get_html() + contents = spider.test(response) + if contents: + for content in contents: + format, response = spider.get_html_2(content) + if format == 'gif': + print('Downloading: {0} It costs {1}s.'.format(content['path'][:-5], time.time() - start)) + else: + print('Downloading: {0} It costs {1}s.'.format(content['path'], time.time() - start)) + counter += 1 + spider.write_into_file(format, response) + print('Get {0}. It costs {1}s'.format(counter, str(time.time() - start))) + + +if __name__ == '__main__': + main() diff --git a/spider_scrapy/scrapy.cfg b/spider_scrapy/scrapy.cfg new file mode 100644 index 0000000..c6e085c --- /dev/null +++ b/spider_scrapy/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = spider_scrapy.settings + +[deploy] +#url = http://localhost:6800/ +project = spider_scrapy diff --git a/spider_scrapy/spider_scrapy/__init__.py b/spider_scrapy/spider_scrapy/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/spider_scrapy/spider_scrapy/items.py b/spider_scrapy/spider_scrapy/items.py new file mode 100644 index 0000000..e5b9070 --- /dev/null +++ b/spider_scrapy/spider_scrapy/items.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class SpiderScrapyItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/spider_scrapy/spider_scrapy/middlewares.py b/spider_scrapy/spider_scrapy/middlewares.py new file mode 100644 index 0000000..67de015 --- /dev/null +++ b/spider_scrapy/spider_scrapy/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class SpiderScrapySpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class SpiderScrapyDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/spider_scrapy/spider_scrapy/pipelines.py b/spider_scrapy/spider_scrapy/pipelines.py new file mode 100644 index 0000000..894bbcc --- /dev/null +++ b/spider_scrapy/spider_scrapy/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class SpiderScrapyPipeline(object): + def process_item(self, item, spider): + return item diff --git a/spider_scrapy/spider_scrapy/settings.py b/spider_scrapy/spider_scrapy/settings.py new file mode 100644 index 0000000..89fbbf0 --- /dev/null +++ b/spider_scrapy/spider_scrapy/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for spider_scrapy project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'spider_scrapy' + +SPIDER_MODULES = ['spider_scrapy.spiders'] +NEWSPIDER_MODULE = 'spider_scrapy.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'spider_scrapy (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'spider_scrapy.middlewares.SpiderScrapySpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'spider_scrapy.middlewares.SpiderScrapyDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'spider_scrapy.pipelines.SpiderScrapyPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/spider_scrapy/spider_scrapy/spiders/__init__.py b/spider_scrapy/spider_scrapy/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/spider_scrapy/spider_scrapy/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders.