Difference between revisions of "Python Data Mining"
Jump to navigation
Jump to search
(Created page with "==Analyze csv files using pandas== import pandas as pd df=pd.read_csv('filename.csv') df.shape # prints the number of rows and columns df.info() # prints out the names and...") |
|||
(2 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
==Analyze csv files using pandas== | ==Analyze csv files using pandas== | ||
<pre> | |||
import pandas as pd | |||
df = pd.read_csv('survey_results_public.csv') | |||
schema_df = pd.read_csv('survey_results_schema.csv') | |||
pd.set_option('display.max_columns', 85) | |||
pd.set_option('display.max_rows', 85) | |||
df[['OpenSourcer', 'Employment']].tail(2) | |||
df.shape # prints out the number of rows and columns | |||
schema_df.tail(2) | |||
df.columns # gives you the names of the columns | |||
df.iloc[[0,1]] # returns the first and 2nd records with all columns | |||
# the inner brackets [0,1] list the row positions; an optional second argument after a comma selects columns | |||
df.iloc[0:3,[5,6,7]] # returns the first three records with specific columns | |||
# 0:3 selects row positions 0-2 (the end is excluded); [5,6,7] lists the column positions | |||
#iloc uses index location | |||
#loc uses labels; unlike iloc, a label slice such as 0:10 includes the end label | |||
df.loc[0:10, 'Hobbyist':'Country'] | |||
#loc uses labels; a list of labels picks just those columns | |||
df.loc[0:10, ['Hobbyist','Student','Country']] | |||
# get a count of yes and no responses | |||
df['Hobbyist'].value_counts() | |||
</pre> | |||
==[[#top|Back To Top]] - [[Python|Category]]== | |||
[[Category:Python]] |
Latest revision as of 16:21, 1 September 2020
Analyze csv files using pandas
import pandas as pd
df = pd.read_csv('survey_results_public.csv')
schema_df = pd.read_csv('survey_results_schema.csv')
pd.set_option('display.max_columns', 85)
pd.set_option('display.max_rows', 85)
df[['OpenSourcer', 'Employment']].tail(2)
df.shape # prints out the number of rows and columns
schema_df.tail(2)
df.columns # gives you the names of the columns
df.iloc[[0,1]] # returns the first and 2nd records with all columns
# the inner brackets [0,1] list the row positions; an optional second argument after a comma selects columns
df.iloc[0:3,[5,6,7]] # returns the first three records with specific columns
# 0:3 selects row positions 0-2 (the end is excluded); [5,6,7] lists the column positions
#iloc uses index location
#loc uses labels; unlike iloc, a label slice such as 0:10 includes the end label
df.loc[0:10, 'Hobbyist':'Country']
#loc uses labels; a list of labels picks just those columns
df.loc[0:10, ['Hobbyist','Student','Country']]
# get a count of yes and no responses
df['Hobbyist'].value_counts()