Skip to content
Snippets Groups Projects
Commit ea2a57ca authored by Pedro Folloni Pesserl's avatar Pedro Folloni Pesserl
Browse files

almost works

parent 2223a855
No related branches found
No related tags found
No related merge requests found
...@@ -37,66 +37,49 @@ CSV=$(realpath $CSV) ...@@ -37,66 +37,49 @@ CSV=$(realpath $CSV)
# converter o arquivo xml.gz para o formato csv # converter o arquivo xml.gz para o formato csv
zcat $XML | \ zcat $XML | \
xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \ xgrep -tx "//PMID|//ArticleTitle|//Abstract|//MeshHeadingList" | \
# o sed acha os campos úteis -- PMID seguido de ArticleTitle seguido # o sed acha os campos úteis -- PMID seguido de ArticleTitle seguido
# de Abstract, seguido ou não de MeshHeadingList -- e converte no # de Abstract, seguido ou não de MeshHeadingList -- e converte no
# formato csv. No caso de não haver MeshHeadingList para um dado artigo, # formato csv. No caso de não haver MeshHeadingList para um dado artigo,
# cria um campo vazio (<). # insere uma linha vazia.
sed -En ' sed -En '
/PMID/{ :x; /<PMID/{
s/<PMID Version="//; s/<PMID Version="//;
s/">//; s/">//;
s/<\/PMID>//; s/<\/PMID>/</;
N; x;
n;
/ArticleTitle/{ /<ArticleTitle>/{
s/<ArticleTitle>/</; s/<ArticleTitle>//;
s/<\/ArticleTitle>//; s/ *<\/ArticleTitle>/</;
N; H;
n;
/Abstract/{ /<Abstract>/{
s/<Abstract> *<AbstractText>/</; s/(<Abstract>|<AbstractText[^>]*>) *//g;
s/<\/AbstractText> *<\/Abstract>//; s/ *<\/AbstractText> *(|<CopyrightInformation>)/, /g;
p; s/(,|<\/CopyrightInformation>) *<\/Abstract>/</g;
n; s/(<sup>|<sub>)//g;
s/(<\/sup>|<\/sub>)/, /g;
/MeshHeadingList/{ H;
s/ *<MeshHeading> *<Descriptor[^>]*>//g; x;
s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading>)/, /g; p;
s/<MeshHeadingList>/</; n;
s/(, *|)<\/MeshHeadingList>//;
p;
};
/PMID/{
i <
}
}
}
}' >> $CSV
# converter para o formato csv /<MeshHeadingList>/{
# sed -i ' s/(<\/DescriptorName>|<\/QualifierName>) *(<Qualifier[^>]*>|<\/MeshHeading> *)/, /g;
# /PMID/{ s/(<MeshHeadingList> *|<MeshHeading> *<Descriptor[^>]*>|(, *|)<\/MeshHeadingList>)//g;
# s/<PMID Version="//; p;
# s/">//; d;
# s/<\/PMID>//; };
# }
# /ArticleTitle/{
# s/<ArticleTitle>/</;
# s/<\/ArticleTitle>//;
# }
# /Abstract/{
# s/<Abstract> *<AbstractText>/</;
# s/<\/AbstractText> *<\/Abstract>//;
# }
# /MeshHeadingList/{ i \
# s/ *<MeshHeading> *<Descriptor[^>]*>//g;
# s/\(<\/DescriptorName>\|<\/QualifierName>\) *\(<Qualifier[^>]*>\|<\/MeshHeading>\)/, /g;
# s/<MeshHeadingList>/</;
# s/\(, *\|\)<\/MeshHeadingList>//;
# }' $CSV
bx;
};
bx;
};
bx;
}' | \
awk 'BEGIN {RS=""} {gsub(/<\n/, "<", $0); print $0}' >> $CSV
0% Loading Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment