#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
@Author :liuyuqi.gov@msn.cn
@Time :2018/7/5 0:56
@File :data_preview.py
'''
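# Headless plotting (no GUI available): the two lines below select the
# non-interactive Agg backend and must come before matplotlib.pyplot is imported.
# In a Jupyter notebook, add the line `%matplotlib inline`.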
  10. import matplotlib
  11. matplotlib.use('Agg')
  12. # 数据预览
  13. import pandas as pd
  14. import matplotlib.pyplot as plt
  15. from configparser import ConfigParser
  16. # step1: 数据参数初始化
  17. cf = ConfigParser()
  18. config_path = "../conf/config.ini"
  19. section_name = "data_file_name"
  20. cf.read(config_path)
  21. app_interference = cf.get(section_name, "app_interference")
  22. app_resources = cf.get(section_name, "app_resources")
  23. instance_deploy = cf.get(section_name, "instance_deploy")
  24. machine_resources = cf.get(section_name, "machine_resources")
  25. def for_df1():
  26. # 应用app表: 应用id/cpu占用量/内存占用/磁盘占用/P/M/PM等指标
  27. df1 = pd.read_csv(app_resources, header=None,
  28. names=list(["appid", "cpu", "mem", "disk", "P", "M", "PM"]), encoding="utf-8")
  29. print(df1.dtypes)
  30. # appid object
  31. # cpu object
  32. # mem object
  33. # disk int64
  34. # P int64
  35. # M int64
  36. # PM int64
  37. print(df1.shape)
  38. # (9338, 7)
  39. # [5 rows x 7 columns]
  40. # print(df1.head())
  41. # app_3 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0.5 | 0....... 0 0
  42. tmp = df1["cpu"].str.split('|', expand=True).astype('float')
  43. # [5 rows x 98 columns]
  44. df1["cpu"] = tmp.T.mean().T # 转置,求均值,再转置回来,这样求得一行的均值。
  45. tmp = df1["mem"].str.split('|', expand=True).astype('float')
  46. # [5 rows x 98 columns]
  47. df1["mem"] = tmp.T.mean().T # 转置,求均值,再转置回来,这样求得一行的均值。
  48. print(df1.head())
  49. print("总共应用:", df1["appid"].unique().shape)
  50. def for_df2():
  51. # 主机表 :宿主机id/ cpu规格/mem规格/disk规格/P上限/M上限/PM上限
  52. df2 = pd.read_csv(machine_resources, header=None, names=list(
  53. ["machineid", "cpu", "mem", "disk", "P", "M", "PM"]), encoding="utf-8")
  54. # df2 = pd.DataFrame(pd.read_csv("../data/scheduling_preliminary_machine_resources_20180606.csv", header=None),columns=list(["machineid", "cpu", "mem", "disk", "P", "M", "PM"]))
  55. print(df2.dtypes)
  56. # machineid object
  57. # cpu int
  58. # mem int
  59. # disk int64
  60. # P int64
  61. # M int64
  62. # PM int64
  63. print(df2.shape)
  64. # (6000, 7)
  65. print(df2.head())
  66. # machine_3 32 64 600 7 3 7
  67. print("总共主机:", df2["machineid"].unique().shape)
  68. # 6000
  69. # 这里主机主要就两类:
  70. # machine_1 32 64 600 7 3 7 数量:3000
  71. # machine_2 92 288 1024 7 7 9 数量:3000
  72. def for_df3():
  73. # 主机machine/实例instance/应用app 关系表
  74. df3 = pd.read_csv(instance_deploy, header=None,
  75. names=list(["instanceid", "appid", "machineid"]), encoding="utf-8")
  76. print(df3.dtypes)
  77. print("df数据大小:", df3.shape)
  78. print("instance唯一数量:", df3["instanceid"].unique().shape)
  79. # print(df2["instanceid"])
  80. print("总共实例:", df3["instanceid"].unique().shape)
  81. def for_df4():
  82. # 主机和实例表。部署appid1的insterference最多可以部署n个appid2
  83. df4 = pd.read_csv(app_interference, header=None,
  84. names=list(["appid1", "appid2", "max_interference"]), encoding="utf-8")
  85. # 查看数据类型
  86. # print(df.dtypes)
  87. print("df数据大小:", df4.shape)
  88. # 查看头尾部数据
  89. # app_8361 app_2163 0
  90. # app_6585 app_8959 0
  91. # print(df.head())
  92. # print(df.tail())
  93. # 查看索引
  94. # print(df.index)
  95. # 查看所有列标
  96. # print(df.columns)
  97. # 查看所有数据
  98. # print(df.values)
  99. # 第一列
  100. # df[0].groupby()
  101. # 第二列
  102. # 第三列
  103. # 描述性统计
  104. print("数据预览:", df4.describe())
  105. plt.plot(df4["max_interference"])
  106. plt.savefig("../submit/fig1.png")
  107. def seeInstance():
  108. df4 = pd.read_csv("../data/instance.csv", header=None, encoding="utf-8",low_memory=False)
  109. df4.head()
  110. seeInstance()