Skip to content

Commit

Permalink
better
Browse files Browse the repository at this point in the history
  • Loading branch information
Jinnrry committed Aug 1, 2019
1 parent bb93990 commit b51e710
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 35 deletions.
32 changes: 28 additions & 4 deletions Query.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// 这里是一些常用的查询语句
// 这里是分析数据时用的查询语句


// 房价均价查询语句

db.lianjia.aggregate([
{'$match': {"address.0": {$exists: true}}},
{
Expand All @@ -29,7 +28,6 @@ db.lianjia.aggregate([


// 平均薪资查询语句

db.zhilian.aggregate([
{'$match': {"workingExp.name": "1-3年"}},
{
Expand Down Expand Up @@ -60,4 +58,30 @@ db.lianjia.aggregate([
{
'$sort': {detailCrawlTime: -1}
}
], {allowDiskUse: true});
], {allowDiskUse: true});


// Rental-listing statistics per city (collection: lianjia_zufang).
// Groups documents by "city" and reports the listing count, the mean and
// population standard deviation of "price", the mean of price / mianji
// (mianji is the floor area the crawler parses from the "NN㎡" text), and
// std / avg as a relative spread. Output is sorted by listing count, descending.
db.lianjia_zufang.aggregate([
    {
        $group: {
            _id: "$city",
            count: {$sum: 1},
            avg: {$avg: "$price"},
            std: {$stdDevPop: "$price"},
            // The crawler stores mianji = 0 when the area cannot be parsed, and
            // $divide raises an error on a zero divisor, which would abort the
            // whole pipeline. Emit null for those rows instead; $avg skips
            // non-numeric values, so they simply do not contribute.
            unitPrice: {
                $avg: {
                    $cond: [
                        {$ne: ["$mianji", 0]},
                        {$divide: ["$price", "$mianji"]},
                        null
                    ]
                }
            }
        }
    },
    {
        $project: {
            unitPrice: 1, // mean price per square metre
            count: 1,     // number of listings
            avg: 1,       // mean listing price
            std: 1,       // population standard deviation of the price
            // Ratio of standard deviation to mean price; null when the mean
            // is 0 so that the division cannot fail.
            ratio: {
                $cond: [
                    {$ne: ["$avg", 0]},
                    {$divide: ["$std", "$avg"]},
                    null
                ]
            }
        }
    },
    {
        '$sort': {count: -1}
    }
]);
31 changes: 29 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,40 @@ option支持:lianjia_ershou、zhilian、lianjia_zufang
方便定时脚本记录抓取情况,使用info命令可以输出当前抓取数据量到文件

```
getAwayBSG -info
getAwayBSG -info -info_save_to=./numLog.txt
```

使用 -info_save_to 参数指定文件保存位置;未指定时默认保存到当前目录下的 numlog.txt 文件(注意文件名为全小写)

3.help

输出支持的全部命令列表

```
getAwayBSG -help
```
```


## 数据分析

分析用的MongoDB语句在[Query.js](./Query.js)文件中,使用MongoDB执行即可

## 编译

编译使用xgo,需要先安装docker

```
git clone https://github.com/jiangwei1995910/getAwayBSG
docker pull karalabe/xgo-latest
go get github.com/karalabe/xgo
cd getAwayBSG
sh ./build.sh
```

## 部署

如果需要分布式或者多进程抓取,在不同机器或者多个进程中指定相同的MongoDB源即可,程序已经支持分布式多进程抓取了。已抓取的链接和状态会通过MongoDB共享。
4 changes: 0 additions & 4 deletions config-all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1515,10 +1515,6 @@ zlCityList:
url: https://www.zhaopin.com/maoming/
code: 771
pinyin: maoming
- name: 蒙自市
url: https://www.zhaopin.com/mengzishi/
code:
pinyin: mengzishi
- name: 满洲里
url: https://www.zhaopin.com/manzhouli/
code: 10157
Expand Down
4 changes: 0 additions & 4 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1515,10 +1515,6 @@ zlCityList:
# url: https://www.zhaopin.com/maoming/
# code: 771
# pinyin: maoming
# - name: 蒙自市
# url: https://www.zhaopin.com/mengzishi/
# code:
# pinyin: mengzishi
# - name: 满洲里
# url: https://www.zhaopin.com/manzhouli/
# code: 10157
Expand Down
10 changes: 5 additions & 5 deletions entrance/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ import (
"time"
)

func Start_info() {
func Start_info(path string) {

fd, _ := os.OpenFile("./numLog.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
fd, _ := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
fd_time := time.Now().Format("2006-01-02 15:04:05")
fd_content := strings.Join([]string{
fd_time, ":\n",
getLianjiaErShouFangStatus(), "\n",
getLianJiaZuFangStatus(), "\n",
fd_time, ":",
getLianjiaErShouFangStatus(), " ",
getLianJiaZuFangStatus(), " ",
getZhiLianStatus(), "\n",
}, "")
buf := []byte(fd_content)
Expand Down
19 changes: 13 additions & 6 deletions entrance/lianjia.go
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ func crawlDetail() (sucnum int) {
c := colly.NewCollector()
configInfo := configs.Config()

//设置延时
if configInfo["crawlDelay"] != nil {
delay, _ := configInfo["crawlDelay"].(json.Number).Int64()
if delay > 0 {
Expand All @@ -166,6 +167,7 @@ func crawlDetail() (sucnum int) {
}
}

//设置代理
if configInfo["proxyList"] != nil && len(configInfo["proxyList"].([]interface{})) > 0 {
var proxyList []string
for _, v := range configInfo["proxyList"].([]interface{}) {
Expand All @@ -181,8 +183,11 @@ func crawlDetail() (sucnum int) {
}
}

//随机UA
extensions.RandomUserAgent(c)
//自动referer
extensions.Referer(c)
//设置MongoDB存储状态信息
storage := &cachemongo.Storage{
Database: "colly",
URI: configInfo["dburl"].(string) + "/colly",
Expand Down Expand Up @@ -240,6 +245,7 @@ func crawlDetail() (sucnum int) {
odb := client.Database(configInfo["dbDatabase"].(string))
lianjia := odb.Collection(configInfo["dbCollection"].(string))

//读取出全部需要抓取详情的数据
cur, err := lianjia.Find(ctx, bson.M{"detailCrawlTime": bson.M{"$exists": false}})

if err != nil {
Expand All @@ -263,28 +269,29 @@ func crawlDetail() (sucnum int) {
}

func Start_lianjia_ershou() {
listFlag := make(chan int)
detailFlag := make(chan int)
listFlag := make(chan int) //记录列表抓取是否完成
detailFlag := make(chan int) //记录详情是否抓取完成

go func() {
listCrawler()
listFlag <- 1
listFlag <- 1 //列表抓取完成
}()

go func() {
zeroNum := 0
for i := 0; i < 1; i = 0 {
if crawlDetail() == 0 {
zeroNum++
if zeroNum > 3 {
if zeroNum > 3 { //尝试3次都没有详情需要抓取,结束详情抓取
break
}
time.Sleep(300 * time.Second)
time.Sleep(300 * time.Second) //没有详情需要抓取了,等待5分钟再尝试
}
}
detailFlag <- 1
detailFlag <- 1 //详情抓取完成
}()

//详情抓取与列表抓取都完成了,结束主线程
<-listFlag
<-detailFlag
}
34 changes: 26 additions & 8 deletions entrance/lianjia_zufang.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) {
})

c.OnHTML(".content__list--item", func(element *colly.HTMLElement) {

var err error
var link string
var title string
var address string
var area string
var price int
var mianji int
element.ForEach(".twoline a", func(i int, element *colly.HTMLElement) {
link = "https://" + element.Request.URL.Host + element.Attr("href")
title = strings.TrimSpace(element.Text)
Expand All @@ -82,6 +83,22 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) {
}
})

desc := element.ChildText(".content__list--item--des")
desc = strings.ReplaceAll(desc, " ", "")
desc = strings.ReplaceAll(desc, "\n", "")
fmt.Println(desc)
re, _ := regexp.Compile("(\\d+)㎡/")
indexs := re.FindStringIndex(desc)
if len(indexs) == 2 {

mianji, err = strconv.Atoi(desc[indexs[0] : indexs[1]-4])
if err != nil {
mianji = 0
}
} else {
mianji = 0
}

element.ForEach(".content__list--item-price em", func(i int, element *colly.HTMLElement) {
var err error
price, err = strconv.Atoi(element.Text)
Expand All @@ -90,26 +107,27 @@ func TcrawlerOneCityZuFang(cityUrl string, cityname string) {
}
})

fmt.Println(price)
fmt.Println(link)
fmt.Println(title)
fmt.Println(address)
fmt.Println(area)
fmt.Println(cityname)
//fmt.Println(price)
//fmt.Println(link)
//fmt.Println(title)
//fmt.Println(address)
//fmt.Println(area)
//fmt.Println(cityname)
fmt.Println("--------------------")

client := db.GetClient()
ctx := db.GetCtx()

db := client.Database(configInfo["dbDatabase"].(string))
lianjia := db.Collection(configInfo["zufangCollection"].(string))
_, err := lianjia.InsertOne(ctx, bson.M{
_, err = lianjia.InsertOne(ctx, bson.M{
"Link": link,
"title": title,
"address": address,
"area": area,
"price": price,
"city": cityname,
"mianji": mianji,
"crawl_time": time.Now(),
})
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions entrance/zhilian.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ func Start_zhilian() {
var total int = 1000
for start := 0; start < total; start += 50 {
cityid := cityList[j].(map[string]interface{})["code"]
if cityid == nil {
fmt.Println(cityList[j])
}
icityid, err := cityid.(json.Number).Int64()
if err != nil {
icityid = 530
Expand Down
9 changes: 7 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"getAwayBSG/entrance"
)

// 实际中应该用更好的变量名
// 声明配置变量
var (
help bool
config string
Expand All @@ -16,6 +16,7 @@ var (
zhilian bool
clean bool
info bool
infoSaveTo string
)

func init() {
Expand All @@ -26,15 +27,19 @@ func init() {
flag.BoolVar(&zhilian, "zhilian", false, "抓取智联招聘数据")
flag.BoolVar(&clean, "clean", false, "清理缓存")
flag.BoolVar(&info, "info", false, "保存抓取状态")
flag.StringVar(&infoSaveTo, "info_save_to", "./numlog.txt", "输入状态文件保存位置")
}

func main() {
flag.Parse()
//初始化配置信息,同时输出配置信息
if config != "" {
configs.SetConfig(config)
}
fmt.Println(configs.Config())


//进入不同入口
if help {
flag.Usage()
} else if lianjia_ershou {
Expand All @@ -46,7 +51,7 @@ func main() {
} else if clean {
entrance.Start_clean()
} else if info {
entrance.Start_info()
entrance.Start_info(infoSaveTo)
} else {
flag.Usage()
}
Expand Down

0 comments on commit b51e710

Please sign in to comment.